diff --git a/Assignment Colab/.ipynb_checkpoints/ACE_class_NaiveBayes_Model-checkpoint.ipynb b/Assignment Colab/.ipynb_checkpoints/ACE_class_NaiveBayes_Model-checkpoint.ipynb new file mode 100644 index 0000000..13df2d9 --- /dev/null +++ b/Assignment Colab/.ipynb_checkpoints/ACE_class_NaiveBayes_Model-checkpoint.ipynb @@ -0,0 +1,345 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "
\n", + "Please follow these colored cells for direction\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", + "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5" + }, + "outputs": [], + "source": [ + "# This Python 3 environment comes with many helpful analytics libraries installed\n", + "# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python\n", + "# For example, here's several helpful packages to load in \n", + "\n", + "import numpy as np # linear algebra\n", + "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n", + "\n", + "# Input data files are available in the \"../input/\" directory.\n", + "# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory\n", + "\n", + "import os\n", + "for dirname, _, filenames in os.walk('/kaggle/input'):\n", + " for filename in filenames:\n", + " print(os.path.join(dirname, filename))\n", + "\n", + "# Any results you write to the current directory are saved as output." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "_cell_guid": "79c7e3d0-c299-4dcb-8224-4455121ee9b0", + "_uuid": "d629ff2d2480ee46fbb7e2d37f6b5fab8052498a" + }, + "outputs": [], + "source": [ + "#Set path of the file to read.\n", + "TrainSet_path = '/kaggle/input/ace-class-assignment/AMP_TrainSet.csv'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#Read the file into a variable TrainSet_data\n", + "TrainSet_data = pd.read_csv(TrainSet_path)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#List all columns in the dataset from which to choose variables for modeling\n", + "TrainSet_data.columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#Use the dot notation to select the column to predict . Call it y\n", + "y = TrainSet_data.CLASS" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "
\n", + "\n", + "## ??\n", + "\n", + "How did you arrive at these features?\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#Create list of features to be used for prediction\n", + "feature_names = ['FULL_Charge', 'FULL_AcidicMolPerc', 'FULL_AURR980107',\n", + " 'FULL_DAYM780201', 'FULL_GEOR030101', 'FULL_OOBM850104', 'NT_EFC195',\n", + " 'AS_MeanAmphiMoment', 'AS_DAYM780201', 'AS_FUKS010112', 'CT_RACS820104']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#Select data corresponding to features in feature_names\n", + "X = TrainSet_data[feature_names]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#Quickly review the data for predictiction of CLASS house prices using the head method\n", + "X.head()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#Import DecisionTreeRegressorsor from scikit-learn library\n", + "from sklearn.naive_bayes import GaussianNB" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#Define model. \n", + "data_model = GaussianNB()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#Fit model\n", + "data_model.fit(X, y)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#Make Predictions for the first rows of the training data to see how the predict function works\n", + "print(\"Making predictions for the following 5 houses:\")\n", + "print(X.head())\n", + "print(\"The predictions are\")\n", + "print(data_model.predict(X.head()))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
\n", + "\n", + "## ??\n", + "\n", + "You have less than 10 algorithms. Why?\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#Calculate mean absolute error; one of many metrics to summarize model quality\n", + "from sklearn.metrics import mean_absolute_error\n", + "\n", + "predicted = data_model.predict(X)\n", + "mean_absolute_error(y, predicted)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#Create classification report containing various statistics required to judge the model\n", + "from sklearn import metrics\n", + "\n", + "print(metrics.classification_report(y, predicted))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#Create a confusion matrix to give us a clear idea of the accuracy and fitting of the model\n", + "print(metrics.confusion_matrix(y, predicted))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#Set path of the file to read.\n", + "Test_data_path = '/kaggle/input/ace-class-assignment/Test.csv'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#Read the file into a variable TrainSet_data\n", + "Test_data = pd.read_csv(Test_data_path)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#Make Predictions for the first rows test data to see how the predict function works\n", + "print(\"Making predictions for test data:\")\n", + "print(Test_data.head())\n", + "print(\"The predictions are\")\n", + "print(data_model.predict(Test_data.head()))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#Make Predictions for the entire test data \n", + "print(\"Making predictions for test data:\")\n", + "print(Test_data)\n", + "print(\"The predictions are\")\n", + 
"print(data_model.predict(Test_data))\n", + "\n", + "test_preds = data_model.predict(Test_data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#Save predictions in format used for competition scoring\n", + "output = pd.DataFrame({'CLASS': test_preds})\n", + "output_bool = output.astype(bool)\n", + "output_bool.to_csv('submission.csv', index_label='Index')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
\n", + "\n", + "## This notebook is not good.\n", + "\n", + "- please check with your class members on how to make it better.\n", + "- we need to see more commenting, why are you doing what you are doing?\n", + "- this will fetch you very poor marks.\n", + "\n", + "\n", + "
" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + }, + "varInspector": { + "cols": { + "lenName": 16, + "lenType": 16, + "lenVar": 40 + }, + "kernels_config": { + "python": { + "delete_cmd_postfix": "", + "delete_cmd_prefix": "del ", + "library": "var_list.py", + "varRefreshCmd": "print(var_dic_list())" + }, + "r": { + "delete_cmd_postfix": ") ", + "delete_cmd_prefix": "rm(", + "library": "var_list.r", + "varRefreshCmd": "cat(var_dic_list()) " + } + }, + "types_to_exclude": [ + "module", + "function", + "builtin_function_or_method", + "instance", + "_Feature" + ], + "window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/Assignment Colab/ACE_Class_aggregatedML.ipynb b/Assignment Colab/ACE_Class_aggregatedML.ipynb new file mode 100644 index 0000000..be26697 --- /dev/null +++ b/Assignment Colab/ACE_Class_aggregatedML.ipynb @@ -0,0 +1 @@ +{"cells":[{"metadata":{},"cell_type":"markdown","source":"

**Introduction: Basic Machine Learning Project**

\n

In this notebook, we will implement a basic machine learning project. We will go through the entire machine learning process: cleaning the data, exploring it to find trends, establishing a baseline model, and evaluating several machine learning approaches for comparison. Let's get started!

\n

Dataset

\n

The objective is to predict CLASS from the other variables, which makes this a supervised classification task. We have a set of training data with known labels, and we want the model to learn a mapping from the features (explanatory variables) to the target (the label), in this case the CLASS. It is a classification task because CLASS is a binary variable.

"},{"metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","trusted":true},"cell_type":"code","source":"# This Python 3 environment comes with many helpful analytics libraries installed\n# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python\n# For example, here's several helpful packages to load in \n\nimport numpy as np # linear algebra\nimport pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n\n# Input data files are available in the \"../input/\" directory.\n# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory\n\nimport os\nfor dirname, _, filenames in os.walk('/kaggle/input'):\n for filename in filenames:\n print(os.path.join(dirname, filename))\n\n# Remove unnecessary warnings\nimport warnings\nwarnings.filterwarnings('ignore')\n\n# Any results you write to the current directory are saved as output.","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# Pandas and numpy for data manipulation\nimport pandas as pd\nimport numpy as np\n \n# Matplotlib and seaborn for plotting\nimport matplotlib.pyplot as plt\n%matplotlib inline\n\nimport matplotlib\nmatplotlib.rcParams['font.size'] = 16\nmatplotlib.rcParams['figure.figsize'] = (9, 9)\n\nimport seaborn as sns\n\nfrom IPython.core.pylabtools import figsize\n\n# Scipy helper functions\nfrom scipy.stats import percentileofscore\nfrom scipy import stats","execution_count":null,"outputs":[]},{"metadata":{"_uuid":"d629ff2d2480ee46fbb7e2d37f6b5fab8052498a","_cell_guid":"79c7e3d0-c299-4dcb-8224-4455121ee9b0","trusted":true},"cell_type":"code","source":"#Set path of the file to read.\nTrainSet_path = '/kaggle/input/ace-class-assignment/AMP_TrainSet.csv'","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"#Read the file into a variable TrainSet_data\nTrainSet_data = 
pd.read_csv(TrainSet_path)","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"TrainSet_data.columns","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"

Exploratory Data Analysis

\n

The first step in solving a data science problem (once you have cleaned data) is exploratory data analysis (EDA). This is an open-ended process where we look for anomalies, interesting trends or patterns, and correlations in a dataset. These may be interesting in their own right and they can inform our modeling. Basically, we use EDA to find out what our data can tell us!

"},{"metadata":{"trusted":true},"cell_type":"code","source":"\n#Let’s look at a snapshot of our data as a pandas dataframe\nTrainSet_data.head()","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"#Check the shape of data\nTrainSet_data.shape","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"#Summary statistics of our data\nTrainSet_data.describe()","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"There are a total of 3038 observations with 12 variables. The CLASS column is our target variable, which makes this a **supervised, classification** machine learning task. It’s **supervised** because we have a set of training data with known targets and, during training, we want our model to learn to predict the CLASS from the other variables. CLASS is a **binary variable** which makes this a **bi-classification** problem.

\nThe primary variable of interest is the CLASS, so let’s take a look at the distribution to check for skew\n"},{"metadata":{"trusted":true},"cell_type":"code","source":"# Histogram of CLASS\nplt.hist(TrainSet_data['CLASS'], bins = 14)\nplt.xlabel('CLASS')\nplt.ylabel('Count')\nplt.title('Distribution of CLASS')","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"

PCA visualization

\n

As we know, it is difficult to visualize data with so many features, i.e. **high-dimensional** data, so we can use PCA to find the two **principal components** and hence visualize the data in **two-dimensional space** with a single **scatter plot**. But before that, we need to pre-process the data: we need to **scale the data** so that each feature has unit variance and no single feature has a greater impact than the others.

\n"},{"metadata":{"trusted":true},"cell_type":"code","source":"#Drop Target column, CLASS \nhigh_dim = TrainSet_data.drop(columns='CLASS')\n\n#Scale the Data\nfrom sklearn.preprocessing import StandardScaler\nscaler = StandardScaler()\nscaler.fit(high_dim)\n\nscaled_data = scaler.transform(high_dim)","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# Specify number of components\nfrom sklearn.decomposition import PCA\npca = PCA(n_components=2)\npca.fit(scaled_data)","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# Transform the data to its first 2 principal components.\nx_pca = pca.transform(scaled_data)","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"#Check the shape of data before PCA\n\nscaled_data.shape\n","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"#Check the shape of data after PCA\n\nx_pca.shape","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"#Let’s plot these two dimensions out\nplt.figure(figsize=(8,6))\nplt.scatter(x_pca[:,0],x_pca[:,1],c=TrainSet_data['CLASS'],cmap='rainbow')\nplt.xlabel('First Principal component')\nplt.ylabel('Second Principal Component')","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"#Interpreting the components\n#Not easy to understand these component reduction.The components correspond to combinations of the original features\n#The components themselves are stored as an attribute of the fitted PCA object:\n\npca.components_","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"#Visualize this using heatmap-\n\nmap= 
pd.DataFrame(pca.components_,columns=high_dim.columns)\nplt.figure(figsize=(12,6))\nsns.heatmap(map,cmap='twilight')","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"This heatmap and the color bar basically represent the **correlation** between the various feature and the principal component itself.\nThis is useful when you are dealing with the high dimensional dataset.\n\n

Feature Selection

\n

We need to perform **feature selection (also called dimensionality reduction)** to choose only the “relevant” variables. This depends on the problem, but because we will be doing linear modeling in this project, we can use a simple measure called the **Correlation Coefficient** to determine the most useful variables for predicting a CLASS. This is a value between -1 and +1 that measures the direction and strength of a linear relationship between two variables.\n\nTo select a limited number of variables, we can find those that have the greatest correlation (either negative or positive) with the CLASS

"},{"metadata":{"trusted":true},"cell_type":"code","source":"#List all columns in the dataset from which to choose variables for modeling\nTrainSet_data.columns","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# Find correlations and sort\nTrainSet_data.corr()['CLASS'].sort_values()","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"In this problem we will use these results to perform feature selection by retaining only the 8 variables that are most highly correlated with the CLASS. 8 is sort of an arbitrary number that I found works well in the model, which shows that a lot of machine learning is just experimentation!\n\nWhile we are performing feature selection, we also split the data into a training and testing set using a Scikit learn function. This is necessary because we need to have a hold-out test set to evaluate our model and make sure it is not overfitting to the testing data:"},{"metadata":{"trusted":true},"cell_type":"code","source":"# Takes in a dataframe, finds the most correlated variables with CLASS and returns training and testing datasets\ndef format_data(TrainSet_data):\n # Use the dot notation to select the column to predict . 
Call it y\n y = TrainSet_data.CLASS\n \n # Find correlations with CLASS\n most_correlated = TrainSet_data.corr().abs()['CLASS'].sort_values(ascending=False)\n \n # Maintain the top 8 most correlation features with CLASS\n most_correlated = most_correlated[:8]\n \n # X is features with top 8 most correlation with CLASS\n X = TrainSet_data.loc[:, most_correlated.index]\n \n #Import train_test_split from scikit-learn library\n from sklearn.model_selection import train_test_split\n \n # Split into training/testing sets with 25% split\n X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25,random_state=42)\n \n return X_train, X_test, y_train, y_test","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"#Snapshot of X_train dataset\nX_train, X_test, y_train, y_test = format_data(TrainSet_data)\nX_train.head()","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"#Print shape of X_train and X_test\nprint(X_train.shape)\nprint(X_test.shape)","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"This leaves us with 2278 training observations and 760 testing data points.\n\n

Examine Selected Features

\n

**Pairs Plot** is great for showing both the distribution of variables and the relations between pairs of variables. Here we use the **seaborn PairGrid function** to show a Pairs Plot for the selected features

"},{"metadata":{"trusted":true},"cell_type":"code","source":"# Calculate correlation coefficient\ndef corrfunc(x, y, **kws):\n r, _ = stats.pearsonr(x, y)\n ax = plt.gca()\n ax.annotate(\"r = {:.2f}\".format(r),\n xy=(.1, .6), xycoords=ax.transAxes,\n size = 24)\n \ncmap = sns.cubehelix_palette(light=1, dark = 0.1,\n hue = 0.5, as_cmap=True)\n\nsns.set_context(font_scale=2)\n\n# Pair grid set up\ng = sns.PairGrid(X_train)\n\n# Scatter plot on the upper triangle\ng.map_upper(plt.scatter, s=10, color = 'red')\n\n# Distribution on the diagonal\ng.map_diag(sns.distplot, kde=False, color = 'red')\n\n# Density Plot and Correlation coefficients on the lower triangle\ng.map_lower(sns.kdeplot, cmap = cmap)\ng.map_lower(corrfunc);","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"There is a lot of information encoded in this plot! On the upper triangle, we have scatterplots of every variable plotted against one another. Notice that most variables are continuous integers, meaning they takes on values in some interval of numbers. \nOn the diagonal, we have histograms showing the distribution of a single variable. The lower right has both 2-D density plots and the correlation coefficient between variables.\nTo interpret the plot, we can select a variable and look at the row and column to find the relationships with all the other variables. For example, the first row shows the scatterplots of CLASS , our target, with the other variables. The first column shows the correlation coefficient between CLASS and the other variables. We see that **AS_MeanAmphiMoment** has the greatest correlation with CLASS in terms of absolute magnitude.\n\n

Selected Variables Distribution by Relation to Median

\n\n

As another exploration of the selected data, we can make distribution plots of each variable, coloring the plot by whether the CLASS is above the median score of 0. To make these plots, we create a column in our dataframe comparing CLASS to 0 and then plot all the values in density plots

\n"},{"metadata":{"trusted":true},"cell_type":"code","source":"# Create relation to the median CLASS column\nX_plot = X_train.copy()\nX_plot['relation_median'] = (X_plot['CLASS'] <= 0)\nX_plot['relation_median'] = X_plot['relation_median'].replace({True: 'below', False: 'above'})\nX_plot = X_plot.drop(columns='CLASS')","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"plt.figure(figsize=(12, 12))\n# Plot the distribution of each variable colored by the relation to the median CLASS\nfor i, col in enumerate(X_plot.columns[:-1]):\n plt.subplot(4, 2, i + 1)\n subset_above = X_plot[X_plot['relation_median'] == 'above']\n subset_below = X_plot[X_plot['relation_median'] == 'below']\n sns.kdeplot(subset_above[col], label = 'Above Median', color = 'green')\n sns.kdeplot(subset_below[col], label = 'Equal/Below Median', color = 'red')\n plt.legend(); plt.title('Distribution of %s' % col)\n \nplt.tight_layout()","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"The green distributions represent CLASS at or above the median, and the red is below. We can see that some variables are more positively correlated with CLASS, while others show negative correlation with CLASS.\n\nThe EDA has given us a good sense of our dataset. We made figures, found relationships between variables, and used these to perform feature selection to retain only the variables most relevant for our task. While EDA is a precursor to modeling, it’s also useful on its own, and many data science problems can be solved solely through the plots and statistics we made here.\n\n

Establish Benchmarks

\n

Metrics

\n\n

One of the most overlooked aspects of the machine learning pipeline is establishing a baseline. Yes, it might look impressive if your classification model achieves 99% accuracy, but what if we could get 98% accuracy just by guessing the same class every time? Would we really want to spend our time building a model for that problem? A good baseline allows us to assess whether or not our model (or any model) is applicable to the task.\n\nFor regression, a good naive baseline is simply to guess the median value of the target for every observation in the test data. In our problem, the median is 0, so let’s assess the accuracy of a model that naively predicts 0 for every entry on the test set. We will use 2 metrics to evaluate predictions:\n* Mean Absolute Error (MAE): The average of the absolute value of the differences between the predictions and true values.\n* Root Mean Squared Error (RMSE): The square root of the average of the squared differences between the predictions and true values.\n\nThe mean absolute error is easily interpretable, as it represents how far off we are on average from the correct value. The root mean squared error penalizes larger errors more heavily and is commonly used in regression tasks. Either metric may be appropriate depending on the situation and we will use both for comparison

"},{"metadata":{"trusted":true},"cell_type":"code","source":"# Calculate mae and rmse\ndef evaluate_predictions(predictions, true):\n mae = np.mean(abs(predictions - true))\n rmse = np.sqrt(np.mean((predictions - true) ** 2))\n \n return mae, rmse","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"\n

Naive Baseline

\n\n

For such a task, a simple naive baseline is to guess the median value of the training set for all testing cases. If our machine learning model cannot beat this simple baseline, then perhaps we should try a different approach!

"},{"metadata":{"trusted":true},"cell_type":"code","source":"# Naive baseline is the median\nmedian_pred = X_train['CLASS'].median()\nmedian_preds = [median_pred for _ in range(len(X_test))]\ntrue = X_test['CLASS']","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# Display the naive baseline metrics\nfrom sklearn.metrics import mean_squared_error, mean_absolute_error, median_absolute_error\n\nmb_mae, mb_rmse = evaluate_predictions(median_preds, true)\nprint('Median Baseline MAE: {:.4f}'.format(mb_mae))\nprint('Median Baseline RMSE: {:.4f}'.format(mb_rmse))","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"If our machine learning model cannot beat these metrics, then we either need to get more data, try another approach, or conclude that machine learning is not applicable to our problem!\n\n

Standard Machine Learning Models

\n\n

It's helpful to compare results from standard techniques such as Linear Regression, Support Vector Machines, or tree-based methods applicable to this task. We will evaluate several of these methods on our dataset. Luckily, these are all implementable with Python libraries such as Scikit-Learn.

"},{"metadata":{"trusted":true},"cell_type":"code","source":"\n# Standard ML Models for comparison\nfrom sklearn.linear_model import LinearRegression\nfrom sklearn.linear_model import ElasticNet\nfrom sklearn.ensemble import RandomForestRegressor\nfrom sklearn.ensemble import ExtraTreesRegressor\nfrom sklearn.ensemble import GradientBoostingRegressor\nfrom sklearn.svm import SVR\nfrom sklearn.tree import DecisionTreeClassifier\nfrom sklearn.naive_bayes import GaussianNB\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.ensemble import GradientBoostingClassifier\nfrom sklearn.svm import SVC\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.ensemble import AdaBoostClassifier\nfrom xgboost import XGBClassifier\n\n# Splitting data into training/testing\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import MinMaxScaler\n\n# Metrics\nfrom sklearn import metrics\nfrom sklearn.metrics import mean_squared_error, mean_absolute_error, median_absolute_error\n\n\n# Distributions\nimport scipy","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# Evaluate several ml models by training on training set and testing on testing set\ndef evaluate(X_train, X_test, y_train, y_test):\n # Names of models\n model_name_list = ['Linear Regression', 'ElasticNet Regression',\n 'Random Forest Regressor', 'Extra Trees Regressor', \n 'SVM Regressor', 'Gradient Boosted Regressor', \n 'Decision Tree classifier', 'Naives Bayes classifier', \n 'GradientBoosting classifier', 'Random Forest Classifier',\n 'SVM classifier', 'LogisticRegression', 'AdaBoostClassifier', \n 'XGBClassifier', 'Baseline']\n \n X_train = X_train.drop(columns='CLASS')\n X_test = X_test.drop(columns='CLASS')\n \n # Instantiate the models\n model1 = LinearRegression()\n model2 = ElasticNet(alpha=1.0, l1_ratio=0.5)\n model3 = RandomForestRegressor(n_estimators=50)\n model4 = ExtraTreesRegressor(n_estimators=50)\n model5 
= SVR(kernel='linear', degree=3, C=1.0, gamma='auto')\n model6 = GradientBoostingRegressor(n_estimators=20)\n model7 = DecisionTreeClassifier(random_state=42)\n model8 = GaussianNB()\n model9 = RandomForestClassifier(random_state=42)\n model10 = GradientBoostingClassifier(random_state=42)\n model11 = SVC(kernel='linear', degree=3, C=1.0, gamma='auto')\n model12 = LogisticRegression(max_iter=200)\n model13 = AdaBoostClassifier(random_state=42)\n model14 = XGBClassifier(random_state=42)\n \n # Dataframe for results\n results = pd.DataFrame(columns=['mae', 'rmse'], index = model_name_list)\n \n \n # Train and predict with each model\n for i, model in enumerate([model1, model2, model3, model4, model5, model6, \n model7, model8, model9, model10, model11, model12, model13, model14]):\n model.fit(X_train, y_train)\n predictions = model.predict(X_test)\n \n # Metrics\n mae = np.mean(abs(predictions - y_test))\n rmse = np.sqrt(np.mean((predictions - y_test) ** 2))\n \n # Insert results into the dataframe\n model_name = model_name_list[i]\n results.loc[model_name, :] = [mae, rmse]\n \n # Median Value Baseline Metrics\n baseline = np.median(y_train)\n baseline_mae = np.mean(abs(baseline - y_test))\n baseline_rmse = np.sqrt(np.mean((baseline - y_test) ** 2))\n \n results.loc['Baseline', :] = [baseline_mae, baseline_rmse]\n \n return results","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"results = evaluate(X_train, X_test, y_train, y_test)\n","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"

Visual Comparison of Models by Model Mean Absolute Error & Model Root Mean Squared Error

"},{"metadata":{"trusted":true},"cell_type":"code","source":"matplotlib.rcParams['figure.figsize'] = (12, 8)\nmatplotlib.rcParams['font.size'] = 16\n# Root mean squared error\nax = plt.subplot(1, 2, 1)\nresults.sort_values('mae', ascending = True).plot.bar(y = 'mae', color = 'b', ax = ax)\nplt.title('Model Mean Absolute Error'); plt.ylabel('MAE');\n\n# Median absolute percentage error\nax = plt.subplot(1, 2, 2)\nresults.sort_values('rmse', ascending = True).plot.bar(y = 'rmse', color = 'r', ax = ax)\nplt.title('Model Root Mean Squared Error'); plt.ylabel('RMSE');\n\nplt.tight_layout()","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# Model Mean Absolute Error & Model Root Mean Squared Error\nresults","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"print('The Random Forest Classifier is {:0.2f}% better than the baseline.'.format(\n (100 * abs(results.loc['Random Forest Classifier', 'mae'] - results.loc['Baseline', 'mae'])) / results.loc['Baseline', 'mae']))","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"Fortunately, we see that all models best the baseline indicating that machine learning will work for this problem. Overall, the Random Forest regression method performs the best. \n\n

Visualizing Decision Trees

\n

Here, we are using Scikit-learn's **export_graphviz** function to display the tree. It converts a single estimator from the random forest classifier into a dot file

"},{"metadata":{"trusted":true},"cell_type":"code","source":"#List all columns in the dataset from which to choose variables for modeling\nX_train.columns\n\n#Drop CLASS to remain with only selected features to be used for prediction\ndata= X_train.drop(columns='CLASS')\ndata.columns\n\n#Create list of selected features to be used for prediction\nfeature_names = ['AS_MeanAmphiMoment', 'FULL_AcidicMolPerc', 'FULL_AURR980107',\n 'FULL_DAYM780201', 'FULL_Charge', 'FULL_OOBM850104', 'AS_DAYM780201']\n\n#Use the dot notation to select the column to predict . Call it target\ntarget = X_train.CLASS","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"\n# Model\nfrom sklearn.ensemble import RandomForestClassifier\nmodel = RandomForestClassifier(random_state=42)\n\n# Train\nmodel.fit(data, target)\n\nestimator = model.estimators_[5]\n\nfrom sklearn.tree import export_graphviz\n\n# Export as dot file\nexport_graphviz(estimator, out_file='tree.dot', \n feature_names = ['AS_MeanAmphiMoment', 'FULL_AcidicMolPerc', 'FULL_AURR980107',\n 'FULL_DAYM780201', 'FULL_Charge', 'FULL_OOBM850104', 'AS_DAYM780201'],\n class_names = ['0', '1'],\n rounded = True, proportion = False, \n precision = 2, filled = True)\n\n# Convert to png\nfrom subprocess import call\ncall(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=600'])\n\n# Display\nplt.figure(figsize = (100, 80))\nplt.imshow(plt.imread('tree.png'))\nplt.axis('off');\nplt.show();","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"In the decision tree chart, each internal node has a decision rule that splits the data. Gini referred as Gini ratio, which measures the impurity of the node. You can say a node is pure when all of its records belong to the same class, such nodes known as the leaf node.\n\nHere, the resultant tree is unpruned. This unpruned tree is unexplainable and not easy to understand. So, let's optimize it by pruning.\n\n

Optimizing Decision Tree Performance

\n

In Scikit-learn, optimization of a decision tree is mostly performed by pre-pruning. The maximum depth of the tree can be used as a control variable for pre-pruning. Here, we plot a decision tree on the same data with max_depth=3. Other than pre-pruning parameters, we can also try other attribute selection measures such as entropy.

"},{"metadata":{"trusted":true},"cell_type":"code","source":"\n# Model\nmodel = RandomForestClassifier(criterion=\"entropy\", max_depth=3, random_state=42)\n\n# Train\nmodel.fit(data, target)\n# Extract single tree\nestimator = model.estimators_[5]\n\nfrom sklearn.tree import export_graphviz\n# Export as dot file\nexport_graphviz(estimator, out_file='tree.dot', \n feature_names = ['AS_MeanAmphiMoment', 'FULL_AcidicMolPerc', 'FULL_AURR980107',\n 'FULL_DAYM780201', 'FULL_Charge', 'FULL_OOBM850104', 'AS_DAYM780201'],\n class_names = ['0', '1'],\n rounded = True, proportion = False, \n precision = 2, filled = True)\n\n# Convert to png\nfrom subprocess import call\ncall(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=600'])\n\n# Display in python\nimport matplotlib.pyplot as plt\nplt.figure(figsize = (100, 80))\nplt.imshow(plt.imread('tree.png'))\nplt.axis('off');\nplt.show();","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"This pruned model is less complex, explainable, and easy to understand than the previous decision tree model plot\n\n

Conclusions

\n\n

While machine learning gets all the attention, it often comprises a small part of a data science project. Most of the work — and most of the value — comes in obtaining, cleaning, and exploring the data. Only once we have a firm grasp on the structure of our data and the relationships within it should we proceed to building machine learning models. I wanted to show the entire process for this project to demonstrate a typical data science workflow. \nIn this project we:\n* Explored the data to find interesting patterns, trends, or anomalies\n* Examined correlations between the features and the target\n* Performed feature selection using correlation values\n* Established a baseline and benchmarked machine learning models

"},{"metadata":{"trusted":true},"cell_type":"code","source":"","execution_count":null,"outputs":[]}],"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"pygments_lexer":"ipython3","nbconvert_exporter":"python","version":"3.6.4","file_extension":".py","codemirror_mode":{"name":"ipython","version":3},"name":"python","mimetype":"text/x-python"}},"nbformat":4,"nbformat_minor":4} \ No newline at end of file diff --git a/Assignment Colab/ACE_class_NaiveBayes_Model.ipynb b/Assignment Colab/ACE_class_NaiveBayes_Model.ipynb new file mode 100644 index 0000000..13df2d9 --- /dev/null +++ b/Assignment Colab/ACE_class_NaiveBayes_Model.ipynb @@ -0,0 +1,345 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "
\n", + "Please follow these colored cells for direction\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", + "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5" + }, + "outputs": [], + "source": [ + "# This Python 3 environment comes with many helpful analytics libraries installed\n", + "# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python\n", + "# For example, here's several helpful packages to load in \n", + "\n", + "import numpy as np # linear algebra\n", + "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n", + "\n", + "# Input data files are available in the \"../input/\" directory.\n", + "# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory\n", + "\n", + "import os\n", + "for dirname, _, filenames in os.walk('/kaggle/input'):\n", + " for filename in filenames:\n", + " print(os.path.join(dirname, filename))\n", + "\n", + "# Any results you write to the current directory are saved as output." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "_cell_guid": "79c7e3d0-c299-4dcb-8224-4455121ee9b0", + "_uuid": "d629ff2d2480ee46fbb7e2d37f6b5fab8052498a" + }, + "outputs": [], + "source": [ + "#Set path of the file to read.\n", + "TrainSet_path = '/kaggle/input/ace-class-assignment/AMP_TrainSet.csv'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#Read the file into a variable TrainSet_data\n", + "TrainSet_data = pd.read_csv(TrainSet_path)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#List all columns in the dataset from which to choose variables for modeling\n", + "TrainSet_data.columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#Use the dot notation to select the column to predict . Call it y\n", + "y = TrainSet_data.CLASS" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "
\n", + "\n", + "## ??\n", + "\n", + "How did you arrive at these features?\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#Create list of features to be used for prediction\n", + "feature_names = ['FULL_Charge', 'FULL_AcidicMolPerc', 'FULL_AURR980107',\n", + " 'FULL_DAYM780201', 'FULL_GEOR030101', 'FULL_OOBM850104', 'NT_EFC195',\n", + " 'AS_MeanAmphiMoment', 'AS_DAYM780201', 'AS_FUKS010112', 'CT_RACS820104']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#Select data corresponding to features in feature_names\n", + "X = TrainSet_data[feature_names]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#Quickly review the data for predictiction of CLASS house prices using the head method\n", + "X.head()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#Import DecisionTreeRegressorsor from scikit-learn library\n", + "from sklearn.naive_bayes import GaussianNB" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#Define model. \n", + "data_model = GaussianNB()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#Fit model\n", + "data_model.fit(X, y)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#Make Predictions for the first rows of the training data to see how the predict function works\n", + "print(\"Making predictions for the following 5 houses:\")\n", + "print(X.head())\n", + "print(\"The predictions are\")\n", + "print(data_model.predict(X.head()))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
\n", + "\n", + "## ??\n", + "\n", + "You have less than 10 algorithms. Why?\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#Calculate mean absolute error; one of many metrics to summarize model quality\n", + "from sklearn.metrics import mean_absolute_error\n", + "\n", + "predicted = data_model.predict(X)\n", + "mean_absolute_error(y, predicted)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#Create classification report containing various statistics required to judge the model\n", + "from sklearn import metrics\n", + "\n", + "print(metrics.classification_report(y, predicted))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#Create a confusion matrix to give us a clear idea of the accuracy and fitting of the model\n", + "print(metrics.confusion_matrix(y, predicted))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#Set path of the file to read.\n", + "Test_data_path = '/kaggle/input/ace-class-assignment/Test.csv'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#Read the file into a variable TrainSet_data\n", + "Test_data = pd.read_csv(Test_data_path)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#Make Predictions for the first rows test data to see how the predict function works\n", + "print(\"Making predictions for test data:\")\n", + "print(Test_data.head())\n", + "print(\"The predictions are\")\n", + "print(data_model.predict(Test_data.head()))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#Make Predictions for the entire test data \n", + "print(\"Making predictions for test data:\")\n", + "print(Test_data)\n", + "print(\"The predictions are\")\n", + 
"print(data_model.predict(Test_data))\n", + "\n", + "test_preds = data_model.predict(Test_data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#Save predictions in format used for competition scoring\n", + "output = pd.DataFrame({'CLASS': test_preds})\n", + "output_bool = output.astype(bool)\n", + "output_bool.to_csv('submission.csv', index_label='Index')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
\n", + "\n", + "## This notebook is not good.\n", + "\n", + "- please check with your class members on how to make it better.\n", + "- we need to see more commenting, why are you doing what you are doing?\n", + "- this will fetch you very poor marks.\n", + "\n", + "\n", + "
" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + }, + "varInspector": { + "cols": { + "lenName": 16, + "lenType": 16, + "lenVar": 40 + }, + "kernels_config": { + "python": { + "delete_cmd_postfix": "", + "delete_cmd_prefix": "del ", + "library": "var_list.py", + "varRefreshCmd": "print(var_dic_list())" + }, + "r": { + "delete_cmd_postfix": ") ", + "delete_cmd_prefix": "rm(", + "library": "var_list.r", + "varRefreshCmd": "cat(var_dic_list()) " + } + }, + "types_to_exclude": [ + "module", + "function", + "builtin_function_or_method", + "instance", + "_Feature" + ], + "window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}