diff --git a/.gitignore b/.gitignore
index 9c3723f0..458278d0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,49 @@
datasets
PythonCode/intermediate_datafiles/
PythonCode/Example_graphs/
+weather_steps.csv
+data_used/StepCount.csv
+exportxml/ActiveEnergyBurned.csv
+.gitignore
+exportxml/ActivitySummary.csv
+exportxml/AppleExerciseTime.csv
+exportxml/AppleStandHour.csv
+exportxml/AppleStandTime.csv
+exportxml/BasalEnergyBurned.csv
+exportxml/DietaryWater.csv
+exportxml/DistanceCycling.csv
+exportxml/DistanceWalkingRunning.csv
+exportxml/EnvironmentalAudioExposure.csv
+exportxml/EnvironmentalSoundReduction.csv
+exportxml/FlightsClimbed.csv
+exportxml/HeadphoneAudioExposure.csv
+exportxml/HeartRate.csv
+exportxml/HeartRateVariabilitySDNN.csv
+exportxml/README.md
+exportxml/RespiratoryRate.csv
+exportxml/RestingHeartRate.csv
+exportxml/SleepAnalysis.csv
+exportxml/StairAscentSpeed.csv
+exportxml/StairDescentSpeed.csv
+exportxml/StepCount.csv
+exportxml/WalkingAsymmetryPercentage.csv
+exportxml/WalkingDoubleSupportPercentage.csv
+exportxml/WalkingHeartRateAverage.csv
+exportxml/WalkingSpeed.csv
+exportxml/WalkingStepLength.csv
+exportxml/Workout.csv
+.gitignore
+Missing_data/missing_data_transformation.ipynb
+Missing_data/missing_data_transfromation.py
+combine_data.py
+export.xml
+README.md
+data_used/weather.txt
+README.md
+Missing_data/weather_steps_merged.csv
+Missing_data/New_weather_steps.csv
+Missing_data/transformation_notgood.py
+New_weather_steps.csv
+Steps_weather_combined.csv
+Missing_data/EDA_old.ipynb
+Transformed_weather_steps.csv
diff --git a/Missing_data/apply_interpolation.ipynb b/Missing_data/apply_interpolation.ipynb
new file mode 100644
index 00000000..b23d3c27
--- /dev/null
+++ b/Missing_data/apply_interpolation.ipynb
@@ -0,0 +1,340 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " start | \n",
+ " steps | \n",
+ " combined | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 2023-01-01 00:00:00 | \n",
+ " 0.000000 | \n",
+ " 5.0 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 2023-01-01 00:10:00 | \n",
+ " 97.054054 | \n",
+ " 5.0 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 2023-01-01 00:20:00 | \n",
+ " 94.945946 | \n",
+ " 5.0 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 2023-01-01 00:30:00 | \n",
+ " 0.000000 | \n",
+ " 5.0 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 2023-01-01 00:40:00 | \n",
+ " 46.876667 | \n",
+ " 5.0 | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 11647 | \n",
+ " 2023-06-05 21:10:00 | \n",
+ " 0.000000 | \n",
+ " 5.0 | \n",
+ "
\n",
+ " \n",
+ " | 11648 | \n",
+ " 2023-06-05 21:20:00 | \n",
+ " 0.000000 | \n",
+ " 5.0 | \n",
+ "
\n",
+ " \n",
+ " | 11649 | \n",
+ " 2023-06-05 21:30:00 | \n",
+ " 0.000000 | \n",
+ " 5.0 | \n",
+ "
\n",
+ " \n",
+ " | 11650 | \n",
+ " 2023-06-05 21:40:00 | \n",
+ " 0.000000 | \n",
+ " 5.0 | \n",
+ "
\n",
+ " \n",
+ " | 11651 | \n",
+ " 2023-06-05 21:50:00 | \n",
+ " 0.000000 | \n",
+ " 5.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
11652 rows × 3 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " start steps combined\n",
+ "0 2023-01-01 00:00:00 0.000000 5.0\n",
+ "1 2023-01-01 00:10:00 97.054054 5.0\n",
+ "2 2023-01-01 00:20:00 94.945946 5.0\n",
+ "3 2023-01-01 00:30:00 0.000000 5.0\n",
+ "4 2023-01-01 00:40:00 46.876667 5.0\n",
+ "... ... ... ...\n",
+ "11647 2023-06-05 21:10:00 0.000000 5.0\n",
+ "11648 2023-06-05 21:20:00 0.000000 5.0\n",
+ "11649 2023-06-05 21:30:00 0.000000 5.0\n",
+ "11650 2023-06-05 21:40:00 0.000000 5.0\n",
+ "11651 2023-06-05 21:50:00 0.000000 5.0\n",
+ "\n",
+ "[11652 rows x 3 columns]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "New_weather_steps = pd.read_csv(\"Transformed_weather_steps.csv\")\n",
+ "New_weather_steps = New_weather_steps.drop(\"Unnamed: 0\",axis=1)\n",
+ "display(New_weather_steps)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " start | \n",
+ " steps | \n",
+ " combined | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 2023-01-01 00:00:00 | \n",
+ " 0.000000 | \n",
+ " 5.0 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 2023-01-01 00:10:00 | \n",
+ " 97.054054 | \n",
+ " 5.0 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 2023-01-01 00:20:00 | \n",
+ " 94.945946 | \n",
+ " 5.0 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 2023-01-01 00:30:00 | \n",
+ " 0.000000 | \n",
+ " 5.0 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 2023-01-01 00:40:00 | \n",
+ " 46.876667 | \n",
+ " 5.0 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " 2023-01-01 00:50:00 | \n",
+ " 126.326232 | \n",
+ " 5.0 | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " 2023-01-01 02:00:00 | \n",
+ " 0.000000 | \n",
+ " 5.0 | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " 2023-01-01 02:10:00 | \n",
+ " 117.962963 | \n",
+ " 5.0 | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " 2023-01-01 02:20:00 | \n",
+ " 134.037037 | \n",
+ " 5.0 | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " 2023-01-01 02:30:00 | \n",
+ " 113.000000 | \n",
+ " 5.0 | \n",
+ "
\n",
+ " \n",
+ " | 10 | \n",
+ " 2023-01-01 02:40:00 | \n",
+ " 11.586919 | \n",
+ " 5.0 | \n",
+ "
\n",
+ " \n",
+ " | 11 | \n",
+ " 2023-01-01 02:50:00 | \n",
+ " 109.013415 | \n",
+ " 5.0 | \n",
+ "
\n",
+ " \n",
+ " | 12 | \n",
+ " 2023-01-01 06:00:00 | \n",
+ " 0.000000 | \n",
+ " 5.0 | \n",
+ "
\n",
+ " \n",
+ " | 13 | \n",
+ " 2023-01-01 06:10:00 | \n",
+ " 0.000000 | \n",
+ " 5.0 | \n",
+ "
\n",
+ " \n",
+ " | 14 | \n",
+ " 2023-01-01 06:20:00 | \n",
+ " 0.000000 | \n",
+ " 5.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " start steps combined\n",
+ "0 2023-01-01 00:00:00 0.000000 5.0\n",
+ "1 2023-01-01 00:10:00 97.054054 5.0\n",
+ "2 2023-01-01 00:20:00 94.945946 5.0\n",
+ "3 2023-01-01 00:30:00 0.000000 5.0\n",
+ "4 2023-01-01 00:40:00 46.876667 5.0\n",
+ "5 2023-01-01 00:50:00 126.326232 5.0\n",
+ "6 2023-01-01 02:00:00 0.000000 5.0\n",
+ "7 2023-01-01 02:10:00 117.962963 5.0\n",
+ "8 2023-01-01 02:20:00 134.037037 5.0\n",
+ "9 2023-01-01 02:30:00 113.000000 5.0\n",
+ "10 2023-01-01 02:40:00 11.586919 5.0\n",
+ "11 2023-01-01 02:50:00 109.013415 5.0\n",
+ "12 2023-01-01 06:00:00 0.000000 5.0\n",
+ "13 2023-01-01 06:10:00 0.000000 5.0\n",
+ "14 2023-01-01 06:20:00 0.000000 5.0"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "New_weather_steps['steps'] = New_weather_steps['steps'].interpolate()\n",
+ "display(New_weather_steps.iloc[:15])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "start 0\n",
+ "steps 0\n",
+ "combined 0\n",
+ "dtype: int64\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(New_weather_steps.isna().sum())"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.13"
+ },
+ "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/Missing_data/apply_interpolation.py b/Missing_data/apply_interpolation.py
new file mode 100644
index 00000000..9bc549ae
--- /dev/null
+++ b/Missing_data/apply_interpolation.py
@@ -0,0 +1,10 @@
+import pandas as pd
+import numpy as np
+
+# Load Transformed_weather_steps.csv (path is resolved relative to the current working directory).
+New_weather_steps = pd.read_csv("Transformed_weather_steps.csv")
+New_weather_steps = New_weather_steps.drop("Unnamed: 0",axis=1)  # presumably the index column written by an earlier to_csv - TODO confirm
+
+# Fill missing 'steps' values using interpolate() with its default settings. NOTE(review): the result is neither saved nor printed; the script ends here.
+New_weather_steps['steps'] = New_weather_steps['steps'].interpolate()
+
diff --git a/Missing_data/apply_kalman_filter.ipynb b/Missing_data/apply_kalman_filter.ipynb
new file mode 100644
index 00000000..89aec545
--- /dev/null
+++ b/Missing_data/apply_kalman_filter.ipynb
@@ -0,0 +1,587 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "from pykalman import KalmanFilter"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "New_weather_steps = pd.read_csv(\"Transformed_weather_steps.csv\")\n",
+ "New_weather_steps = New_weather_steps.drop(\"Unnamed: 0\",axis=1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Implements the Kalman filter for single columns.\n",
+ "def apply_kalman_filter(data_table, col):\n",
+ " # Initialize the Kalman filter with the trivial transition and observation matrices.\n",
+ " kf = KalmanFilter(transition_matrices=[[1]], observation_matrices=[[1]])\n",
+ "\n",
+ " numpy_array_state = data_table[col].values\n",
+ " numpy_array_state = numpy_array_state.astype(np.float32)\n",
+ " numpy_matrix_state_with_mask = np.ma.masked_invalid(numpy_array_state)\n",
+ "\n",
+ " # Find the best other parameters based on the data (e.g. Q)\n",
+ " kf = kf.em(numpy_matrix_state_with_mask, n_iter=5)\n",
+ "\n",
+ " # And apply the filter.\n",
+ " (new_data, filtered_state_covariances) = kf.filter(numpy_matrix_state_with_mask)\n",
+ "\n",
+ " data_table[col + '_kalman'] = new_data\n",
+ " return(data_table)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " start | \n",
+ " end | \n",
+ " steps | \n",
+ " start_weather | \n",
+ " end_weather | \n",
+ " # STN | \n",
+ " direction_wind | \n",
+ " windspeed_avg_hour | \n",
+ " windspeed_avg_10min | \n",
+ " max_wind_gust | \n",
+ " ... | \n",
+ " cloud_cover | \n",
+ " relative_humidity | \n",
+ " weather_code | \n",
+ " indicator_present_weather_code | \n",
+ " fog | \n",
+ " rain | \n",
+ " snow | \n",
+ " thunder | \n",
+ " ice_formation | \n",
+ " steps_kalman | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 2023-01-01 00:00:00 | \n",
+ " 2023-01-01 00:10:00 | \n",
+ " 0.000000 | \n",
+ " 2023-01-01 00:00:00 | \n",
+ " 2023-01-01 01:00:00 | \n",
+ " 215 | \n",
+ " 220.0 | \n",
+ " 110.0 | \n",
+ " 130.0 | \n",
+ " 200.0 | \n",
+ " ... | \n",
+ " 8.0 | \n",
+ " 68.0 | \n",
+ " NaN | \n",
+ " 5 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 20.465898 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 2023-01-01 00:10:00 | \n",
+ " 2023-01-01 00:20:00 | \n",
+ " 97.054054 | \n",
+ " 2023-01-01 00:00:00 | \n",
+ " 2023-01-01 01:00:00 | \n",
+ " 215 | \n",
+ " 220.0 | \n",
+ " 110.0 | \n",
+ " 130.0 | \n",
+ " 200.0 | \n",
+ " ... | \n",
+ " 8.0 | \n",
+ " 68.0 | \n",
+ " NaN | \n",
+ " 5 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 50.579339 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 2023-01-01 00:20:00 | \n",
+ " 2023-01-01 00:30:00 | \n",
+ " 94.945946 | \n",
+ " 2023-01-01 00:00:00 | \n",
+ " 2023-01-01 01:00:00 | \n",
+ " 215 | \n",
+ " 220.0 | \n",
+ " 110.0 | \n",
+ " 130.0 | \n",
+ " 200.0 | \n",
+ " ... | \n",
+ " 8.0 | \n",
+ " 68.0 | \n",
+ " NaN | \n",
+ " 5 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 73.204683 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 2023-01-01 00:30:00 | \n",
+ " 2023-01-01 00:40:00 | \n",
+ " 0.000000 | \n",
+ " 2023-01-01 00:00:00 | \n",
+ " 2023-01-01 01:00:00 | \n",
+ " 215 | \n",
+ " 220.0 | \n",
+ " 110.0 | \n",
+ " 130.0 | \n",
+ " 200.0 | \n",
+ " ... | \n",
+ " 8.0 | \n",
+ " 68.0 | \n",
+ " NaN | \n",
+ " 5 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 33.931264 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 2023-01-01 00:40:00 | \n",
+ " 2023-01-01 00:50:00 | \n",
+ " 46.876667 | \n",
+ " 2023-01-01 00:00:00 | \n",
+ " 2023-01-01 01:00:00 | \n",
+ " 215 | \n",
+ " 220.0 | \n",
+ " 110.0 | \n",
+ " 130.0 | \n",
+ " 200.0 | \n",
+ " ... | \n",
+ " 8.0 | \n",
+ " 68.0 | \n",
+ " NaN | \n",
+ " 5 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 40.949190 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " 2023-01-01 00:50:00 | \n",
+ " 2023-01-01 01:00:00 | \n",
+ " 126.326232 | \n",
+ " 2023-01-01 00:00:00 | \n",
+ " 2023-01-01 01:00:00 | \n",
+ " 215 | \n",
+ " 220.0 | \n",
+ " 110.0 | \n",
+ " 130.0 | \n",
+ " 200.0 | \n",
+ " ... | \n",
+ " 8.0 | \n",
+ " 68.0 | \n",
+ " NaN | \n",
+ " 5 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 87.334058 | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " 2023-01-01 01:00:00 | \n",
+ " 2023-01-01 01:10:00 | \n",
+ " 128.406776 | \n",
+ " 2023-01-01 01:00:00 | \n",
+ " 2023-01-01 02:00:00 | \n",
+ " 285 | \n",
+ " 220.0 | \n",
+ " 160.0 | \n",
+ " 150.0 | \n",
+ " 200.0 | \n",
+ " ... | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 6 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 109.658711 | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " 2023-01-01 01:10:00 | \n",
+ " 2023-01-01 01:20:00 | \n",
+ " 26.821821 | \n",
+ " 2023-01-01 01:00:00 | \n",
+ " 2023-01-01 02:00:00 | \n",
+ " 285 | \n",
+ " 220.0 | \n",
+ " 160.0 | \n",
+ " 150.0 | \n",
+ " 200.0 | \n",
+ " ... | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 6 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 64.629329 | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " 2023-01-01 01:20:00 | \n",
+ " 2023-01-01 01:30:00 | \n",
+ " 0.000000 | \n",
+ " 2023-01-01 01:00:00 | \n",
+ " 2023-01-01 02:00:00 | \n",
+ " 285 | \n",
+ " 220.0 | \n",
+ " 160.0 | \n",
+ " 150.0 | \n",
+ " 200.0 | \n",
+ " ... | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 6 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 29.496724 | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " 2023-01-01 01:30:00 | \n",
+ " 2023-01-01 01:40:00 | \n",
+ " 94.514644 | \n",
+ " 2023-01-01 01:00:00 | \n",
+ " 2023-01-01 02:00:00 | \n",
+ " 285 | \n",
+ " 220.0 | \n",
+ " 160.0 | \n",
+ " 150.0 | \n",
+ " 200.0 | \n",
+ " ... | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 6 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 64.840710 | \n",
+ "
\n",
+ " \n",
+ " | 10 | \n",
+ " 2023-01-01 01:40:00 | \n",
+ " 2023-01-01 01:50:00 | \n",
+ " 83.035725 | \n",
+ " 2023-01-01 01:00:00 | \n",
+ " 2023-01-01 02:00:00 | \n",
+ " 285 | \n",
+ " 220.0 | \n",
+ " 160.0 | \n",
+ " 150.0 | \n",
+ " 200.0 | \n",
+ " ... | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 6 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 74.731600 | \n",
+ "
\n",
+ " \n",
+ " | 11 | \n",
+ " 2023-01-01 01:50:00 | \n",
+ " 2023-01-01 02:00:00 | \n",
+ " 1.945755 | \n",
+ " 2023-01-01 01:00:00 | \n",
+ " 2023-01-01 02:00:00 | \n",
+ " 285 | \n",
+ " 220.0 | \n",
+ " 160.0 | \n",
+ " 150.0 | \n",
+ " 200.0 | \n",
+ " ... | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 6 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 35.164897 | \n",
+ "
\n",
+ " \n",
+ " | 12 | \n",
+ " 2023-01-01 02:00:00 | \n",
+ " 2023-01-01 02:10:00 | \n",
+ " 0.000000 | \n",
+ " 2023-01-01 02:00:00 | \n",
+ " 2023-01-01 03:00:00 | \n",
+ " 310 | \n",
+ " 210.0 | \n",
+ " 140.0 | \n",
+ " 130.0 | \n",
+ " 170.0 | \n",
+ " ... | \n",
+ " 8.0 | \n",
+ " 72.0 | \n",
+ " NaN | \n",
+ " 5 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 16.049105 | \n",
+ "
\n",
+ " \n",
+ " | 13 | \n",
+ " 2023-01-01 02:10:00 | \n",
+ " 2023-01-01 02:20:00 | \n",
+ " 117.962963 | \n",
+ " 2023-01-01 02:00:00 | \n",
+ " 2023-01-01 03:00:00 | \n",
+ " 310 | \n",
+ " 210.0 | \n",
+ " 140.0 | \n",
+ " 130.0 | \n",
+ " 170.0 | \n",
+ " ... | \n",
+ " 8.0 | \n",
+ " 72.0 | \n",
+ " NaN | \n",
+ " 5 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 71.449923 | \n",
+ "
\n",
+ " \n",
+ " | 14 | \n",
+ " 2023-01-01 02:20:00 | \n",
+ " 2023-01-01 02:30:00 | \n",
+ " 134.037037 | \n",
+ " 2023-01-01 02:00:00 | \n",
+ " 2023-01-01 03:00:00 | \n",
+ " 310 | \n",
+ " 210.0 | \n",
+ " 140.0 | \n",
+ " 130.0 | \n",
+ " 170.0 | \n",
+ " ... | \n",
+ " 8.0 | \n",
+ " 72.0 | \n",
+ " NaN | \n",
+ " 5 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 105.472550 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
15 rows × 29 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " start end steps start_weather \\\n",
+ "0 2023-01-01 00:00:00 2023-01-01 00:10:00 0.000000 2023-01-01 00:00:00 \n",
+ "1 2023-01-01 00:10:00 2023-01-01 00:20:00 97.054054 2023-01-01 00:00:00 \n",
+ "2 2023-01-01 00:20:00 2023-01-01 00:30:00 94.945946 2023-01-01 00:00:00 \n",
+ "3 2023-01-01 00:30:00 2023-01-01 00:40:00 0.000000 2023-01-01 00:00:00 \n",
+ "4 2023-01-01 00:40:00 2023-01-01 00:50:00 46.876667 2023-01-01 00:00:00 \n",
+ "5 2023-01-01 00:50:00 2023-01-01 01:00:00 126.326232 2023-01-01 00:00:00 \n",
+ "6 2023-01-01 01:00:00 2023-01-01 01:10:00 128.406776 2023-01-01 01:00:00 \n",
+ "7 2023-01-01 01:10:00 2023-01-01 01:20:00 26.821821 2023-01-01 01:00:00 \n",
+ "8 2023-01-01 01:20:00 2023-01-01 01:30:00 0.000000 2023-01-01 01:00:00 \n",
+ "9 2023-01-01 01:30:00 2023-01-01 01:40:00 94.514644 2023-01-01 01:00:00 \n",
+ "10 2023-01-01 01:40:00 2023-01-01 01:50:00 83.035725 2023-01-01 01:00:00 \n",
+ "11 2023-01-01 01:50:00 2023-01-01 02:00:00 1.945755 2023-01-01 01:00:00 \n",
+ "12 2023-01-01 02:00:00 2023-01-01 02:10:00 0.000000 2023-01-01 02:00:00 \n",
+ "13 2023-01-01 02:10:00 2023-01-01 02:20:00 117.962963 2023-01-01 02:00:00 \n",
+ "14 2023-01-01 02:20:00 2023-01-01 02:30:00 134.037037 2023-01-01 02:00:00 \n",
+ "\n",
+ " end_weather # STN direction_wind windspeed_avg_hour \\\n",
+ "0 2023-01-01 01:00:00 215 220.0 110.0 \n",
+ "1 2023-01-01 01:00:00 215 220.0 110.0 \n",
+ "2 2023-01-01 01:00:00 215 220.0 110.0 \n",
+ "3 2023-01-01 01:00:00 215 220.0 110.0 \n",
+ "4 2023-01-01 01:00:00 215 220.0 110.0 \n",
+ "5 2023-01-01 01:00:00 215 220.0 110.0 \n",
+ "6 2023-01-01 02:00:00 285 220.0 160.0 \n",
+ "7 2023-01-01 02:00:00 285 220.0 160.0 \n",
+ "8 2023-01-01 02:00:00 285 220.0 160.0 \n",
+ "9 2023-01-01 02:00:00 285 220.0 160.0 \n",
+ "10 2023-01-01 02:00:00 285 220.0 160.0 \n",
+ "11 2023-01-01 02:00:00 285 220.0 160.0 \n",
+ "12 2023-01-01 03:00:00 310 210.0 140.0 \n",
+ "13 2023-01-01 03:00:00 310 210.0 140.0 \n",
+ "14 2023-01-01 03:00:00 310 210.0 140.0 \n",
+ "\n",
+ " windspeed_avg_10min max_wind_gust ... cloud_cover relative_humidity \\\n",
+ "0 130.0 200.0 ... 8.0 68.0 \n",
+ "1 130.0 200.0 ... 8.0 68.0 \n",
+ "2 130.0 200.0 ... 8.0 68.0 \n",
+ "3 130.0 200.0 ... 8.0 68.0 \n",
+ "4 130.0 200.0 ... 8.0 68.0 \n",
+ "5 130.0 200.0 ... 8.0 68.0 \n",
+ "6 150.0 200.0 ... NaN NaN \n",
+ "7 150.0 200.0 ... NaN NaN \n",
+ "8 150.0 200.0 ... NaN NaN \n",
+ "9 150.0 200.0 ... NaN NaN \n",
+ "10 150.0 200.0 ... NaN NaN \n",
+ "11 150.0 200.0 ... NaN NaN \n",
+ "12 130.0 170.0 ... 8.0 72.0 \n",
+ "13 130.0 170.0 ... 8.0 72.0 \n",
+ "14 130.0 170.0 ... 8.0 72.0 \n",
+ "\n",
+ " weather_code indicator_present_weather_code fog rain snow thunder \\\n",
+ "0 NaN 5 0.0 0.0 0.0 0.0 \n",
+ "1 NaN 5 0.0 0.0 0.0 0.0 \n",
+ "2 NaN 5 0.0 0.0 0.0 0.0 \n",
+ "3 NaN 5 0.0 0.0 0.0 0.0 \n",
+ "4 NaN 5 0.0 0.0 0.0 0.0 \n",
+ "5 NaN 5 0.0 0.0 0.0 0.0 \n",
+ "6 NaN 6 NaN NaN NaN NaN \n",
+ "7 NaN 6 NaN NaN NaN NaN \n",
+ "8 NaN 6 NaN NaN NaN NaN \n",
+ "9 NaN 6 NaN NaN NaN NaN \n",
+ "10 NaN 6 NaN NaN NaN NaN \n",
+ "11 NaN 6 NaN NaN NaN NaN \n",
+ "12 NaN 5 0.0 0.0 0.0 0.0 \n",
+ "13 NaN 5 0.0 0.0 0.0 0.0 \n",
+ "14 NaN 5 0.0 0.0 0.0 0.0 \n",
+ "\n",
+ " ice_formation steps_kalman \n",
+ "0 0.0 20.465898 \n",
+ "1 0.0 50.579339 \n",
+ "2 0.0 73.204683 \n",
+ "3 0.0 33.931264 \n",
+ "4 0.0 40.949190 \n",
+ "5 0.0 87.334058 \n",
+ "6 NaN 109.658711 \n",
+ "7 NaN 64.629329 \n",
+ "8 NaN 29.496724 \n",
+ "9 NaN 64.840710 \n",
+ "10 NaN 74.731600 \n",
+ "11 NaN 35.164897 \n",
+ "12 0.0 16.049105 \n",
+ "13 0.0 71.449923 \n",
+ "14 0.0 105.472550 \n",
+ "\n",
+ "[15 rows x 29 columns]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "data_table = New_weather_steps\n",
+ "col = \"steps\"\n",
+ "data_table = apply_kalman_filter(data_table, col)\n",
+ "display(data_table.iloc[:15])"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.13"
+ },
+ "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/Missing_data/apply_kalman_filter.py b/Missing_data/apply_kalman_filter.py
new file mode 100644
index 00000000..ee3e1af9
--- /dev/null
+++ b/Missing_data/apply_kalman_filter.py
@@ -0,0 +1,28 @@
+import pandas as pd
+import numpy as np
+from pykalman import KalmanFilter
+
+New_weather_steps = pd.read_csv("Transformed_weather_steps.csv")  # path is resolved relative to the current working directory
+New_weather_steps = New_weather_steps.drop("Unnamed: 0",axis=1)  # presumably the index column written by an earlier to_csv - TODO confirm
+
+# Runs a pykalman KalmanFilter over data_table[col] and stores the filtered values in a new column col + '_kalman'; data_table is modified in place and also returned.
+def apply_kalman_filter(data_table, col):
+ # Initialize the Kalman filter with the trivial transition and observation matrices ([[1]] each).
+ kf = KalmanFilter(transition_matrices=[[1]], observation_matrices=[[1]])
+
+ numpy_array_state = data_table[col].values
+ numpy_array_state = numpy_array_state.astype(np.float32)
+ numpy_matrix_state_with_mask = np.ma.masked_invalid(numpy_array_state)  # mask NaN/inf entries so they are not used as observed values
+
+ # Find the best other parameters based on the data (e.g. Q), using 5 EM iterations.
+ kf = kf.em(numpy_matrix_state_with_mask, n_iter=5)
+
+ # And apply the filter; the returned state covariances are not used afterwards.
+ (new_data, filtered_state_covariances) = kf.filter(numpy_matrix_state_with_mask)
+
+ data_table[col + '_kalman'] = new_data  # writes into the caller's DataFrame (no copy is made)
+ return(data_table)
+
+data_table = New_weather_steps  # second name for the same DataFrame object, not a copy
+col = "steps"
+data_table = apply_kalman_filter(data_table, col)  # adds a "steps_kalman" column. NOTE(review): the result is not saved or printed in this script
\ No newline at end of file
diff --git a/PythonCode/Chapter2/CreateDataset.py b/PythonCode/Chapter2/CreateDataset.py
deleted file mode 100644
index 32302c5b..00000000
--- a/PythonCode/Chapter2/CreateDataset.py
+++ /dev/null
@@ -1,125 +0,0 @@
-##############################################################
-# #
-# Mark Hoogendoorn and Burkhardt Funk (2017) #
-# Machine Learning for the Quantified Self #
-# Springer #
-# Chapter 2 #
-# #
-##############################################################
-
-import pandas as pd
-import numpy as np
-import re
-import copy
-from datetime import datetime, timedelta
-import matplotlib.pyplot as plot
-import matplotlib.dates as md
-
-
-class CreateDataset():
-
- base_dir = ''
- granulairity = 0
- data_table = None
-
- def __init__(self, base_dir, granularity):
- self.base_dir = base_dir
- self.granularity = granularity
-
- # Create an initial data table with entries from start till end time, with steps
- # of size granularity. Granularity is specified in milliseconds
- def create_timestamps(self, start_time, end_time):
- return pd.date_range(start_time, end_time, freq=str(self.granularity)+'ms')
-
- def create_dataset(self, start_time, end_time, cols, prefix):
- c = copy.deepcopy(cols)
- if not prefix == '':
- for i in range(0, len(c)):
- c[i] = str(prefix) + str(c[i])
- timestamps = self.create_timestamps(start_time, end_time)
- self.data_table = pd.DataFrame(index=timestamps, columns=c)
-
- # Add numerical data, we assume timestamps in the form of nanoseconds from the epoch
- def add_numerical_dataset(self, file, timestamp_col, value_cols, aggregation='avg', prefix=''):
- dataset = pd.read_csv(self.base_dir + file, skipinitialspace=True)
-
- # Convert timestamps to dates
- dataset[timestamp_col] = pd.to_datetime(dataset[timestamp_col])
-
- # Create a table based on the times found in the dataset
- if self.data_table is None:
- self.create_dataset(min(dataset[timestamp_col]), max(dataset[timestamp_col]), value_cols, prefix)
- else:
- for col in value_cols:
- self.data_table[str(prefix) + str(col)] = np.nan
-
- # Over all rows in the new table
- for i in range(0, len(self.data_table.index)):
- # Select the relevant measurements.
- relevant_rows = dataset[
- (dataset[timestamp_col] >= self.data_table.index[i]) &
- (dataset[timestamp_col] < (self.data_table.index[i] +
- timedelta(milliseconds=self.granularity)))
- ]
- for col in value_cols:
- # Take the average value
- if len(relevant_rows) > 0:
- if aggregation == 'avg':
- self.data_table.loc[self.data_table.index[i], str(prefix)+str(col)] = np.average(relevant_rows[col])
- else:
- raise ValueError("Unknown aggregation '" + aggregation + "'")
- else:
- self.data_table.loc[self.data_table.index[i], str(prefix)+str(col)] = np.nan
-
- # Remove undesired value from the names.
- def clean_name(self, name):
- return re.sub('[^0-9a-zA-Z]+', '', name)
-
- # Add data in which we have rows that indicate the occurrence of a certain event with a given start and end time.
- # 'aggregation' can be 'sum' or 'binary'.
- def add_event_dataset(self, file, start_timestamp_col, end_timestamp_col, value_col, aggregation='sum'):
- dataset = pd.read_csv(self.base_dir + file)
-
- # Convert timestamps to datetime.
- dataset[start_timestamp_col] = pd.to_datetime(dataset[start_timestamp_col])
- dataset[end_timestamp_col] = pd.to_datetime(dataset[end_timestamp_col])
-
- # Clean the event values in the dataset
- dataset[value_col] = dataset[value_col].apply(self.clean_name)
- event_values = dataset[value_col].unique()
-
- # Add columns for all possible values (or create a new dataset if empty), set the default to 0 occurrences
- if self.data_table is None:
- self.create_dataset(min(dataset[start_timestamp_col]), max(dataset[end_timestamp_col]), event_values, value_col)
- for col in event_values:
- self.data_table[(str(value_col) + str(col))] = 0
-
- # Now we need to start counting by passing along the rows....
- for i in range(0, len(dataset.index)):
- # identify the time points of the row in our dataset and the value
- start = dataset[start_timestamp_col][i]
- end = dataset[end_timestamp_col][i]
- value = dataset[value_col][i]
- border = (start - timedelta(milliseconds=self.granularity))
-
- # get the right rows from our data table
- relevant_rows = self.data_table[(start <= (self.data_table.index +timedelta(milliseconds=self.granularity))) & (end > self.data_table.index)]
-
- # and add 1 to the rows if we take the sum
- if aggregation == 'sum':
- self.data_table.loc[relevant_rows.index, str(value_col) + str(value)] += 1
- # or set to 1 if we just want to know it happened
- elif aggregation == 'binary':
- self.data_table.loc[relevant_rows.index, str(value_col) + str(value)] = 1
- else:
- raise ValueError("Unknown aggregation '" + aggregation + "'")
-
- # This function returns the column names that have one of the strings expressed by 'ids' in the column name.
- def get_relevant_columns(self, ids):
- relevant_dataset_cols = []
- cols = list(self.data_table.columns)
-
- for id in ids:
- relevant_dataset_cols.extend([col for col in cols if id in col])
-
- return relevant_dataset_cols
diff --git a/PythonCode/Chapter2/__init__.py b/PythonCode/Chapter2/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/PythonCode/Chapter3/DataTransformation.py b/PythonCode/Chapter3/DataTransformation.py
deleted file mode 100644
index c8902bd8..00000000
--- a/PythonCode/Chapter3/DataTransformation.py
+++ /dev/null
@@ -1,70 +0,0 @@
-##############################################################
-# #
-# Mark Hoogendoorn and Burkhardt Funk (2017) #
-# Machine Learning for the Quantified Self #
-# Springer #
-# Chapter 3 #
-# #
-##############################################################
-
-import numpy as np
-import matplotlib.pyplot as plot
-from sklearn.decomposition import PCA
-import math
-import copy
-import util.util as util
-from scipy.signal import butter, lfilter, filtfilt
-
-# This class removes the high frequency data (that might be considered noise) from the data.
-class LowPassFilter:
-
- def low_pass_filter(self, data_table, col, sampling_frequency, cutoff_frequency, order=5, phase_shift=True):
- # http://stackoverflow.com/questions/12093594/how-to-implement-band-pass-butterworth-filter-with-scipy-signal-butter
- # Cutoff frequencies are expressed as the fraction of the Nyquist frequency, which is half the sampling frequency
- nyq = 0.5 * sampling_frequency
- cut = cutoff_frequency / nyq
- b, a = butter(order, cut, btype='low', analog=False)
- if phase_shift:
- data_table[col + '_lowpass'] = filtfilt(b, a, data_table[col])
- else:
- data_table[col + '_lowpass'] = lfilter(b, a, data_table[col])
- return data_table
-
-# Class for Principal Component Analysis. We can only apply this when we do not have missing values (i.e. NaN).
-# For this we have to impute these first, be aware of this.
-class PrincipalComponentAnalysis:
-
- pca = []
-
- def __init__(self):
- self.pca = []
-
- # Perform the PCA on the selected columns and return the explained variance.
- def determine_pc_explained_variance(self, data_table, cols):
- # Normalize the data first.
- dt_norm = util.normalize_dataset(data_table, cols)
-
- # perform the PCA.
- self.pca = PCA(n_components = len(cols))
- self.pca.fit(dt_norm[cols])
- # And return the explained variances.
- return self.pca.explained_variance_ratio_
-
- # Apply a PCA given the number of components we have selected.
- # We add new pca columns.
- def apply_pca(self, data_table, cols, number_comp):
- # Normalize the data first.
- dt_norm = util.normalize_dataset(data_table, cols)
-
- # perform the PCA.
- self.pca = PCA(n_components = number_comp)
- self.pca.fit(dt_norm[cols])
-
- # Transform our old values.
- new_values = self.pca.transform(dt_norm[cols])
-
- #And add the new ones:
- for comp in range(0, number_comp):
- data_table['pca_' +str(comp+1)] = new_values[:,comp]
-
- return data_table
diff --git a/PythonCode/Chapter3/ImputationMissingValues.py b/PythonCode/Chapter3/ImputationMissingValues.py
deleted file mode 100644
index 6e6ab095..00000000
--- a/PythonCode/Chapter3/ImputationMissingValues.py
+++ /dev/null
@@ -1,28 +0,0 @@
-##############################################################
-# #
-# Mark Hoogendoorn and Burkhardt Funk (2017) #
-# Machine Learning for the Quantified Self #
-# Springer #
-# Chapter 3 #
-# #
-##############################################################
-
-# Simple class to impute missing values of a single columns.
-class ImputationMissingValues:
-
- # Impute the mean values in case if missing data.
- def impute_mean(self, dataset, col):
- dataset[col] = dataset[col].fillna(dataset[col].mean())
- return dataset
-
- # Impute the median values in case if missing data.
- def impute_median(self, dataset, col):
- dataset[col] = dataset[col].fillna(dataset[col].median())
- return dataset
-
- # Interpolate the dataset based on previous/next values..
- def impute_interpolate(self, dataset, col):
- dataset[col] = dataset[col].interpolate()
- # And fill the initial data points if needed:
- dataset[col] = dataset[col].fillna(method='bfill')
- return dataset
\ No newline at end of file
diff --git a/PythonCode/Chapter3/KalmanFilters.py b/PythonCode/Chapter3/KalmanFilters.py
deleted file mode 100644
index 74f6257d..00000000
--- a/PythonCode/Chapter3/KalmanFilters.py
+++ /dev/null
@@ -1,36 +0,0 @@
-##############################################################
-# #
-# Mark Hoogendoorn and Burkhardt Funk (2017) #
-# Machine Learning for the Quantified Self #
-# Springer #
-# Chapter 3 #
-# #
-##############################################################
-
-import numpy as np
-from pykalman import KalmanFilter
-
-# Implements the Kalman filter for single columns.
-class KalmanFilters:
-
- # Very simple Kalman filter: fill missing values and remove outliers for single attribute.
- # We assume a very simple transition matrix, namely simply a [[1]]. It
- # is however still useful as it is able to dampen outliers and impute missing values. The new
- # values are appended in a new column.
- def apply_kalman_filter(self, data_table, col):
-
- # Initialize the Kalman filter with the trivial transition and observation matrices.
- kf = KalmanFilter(transition_matrices = [[1]], observation_matrices = [[1]])
-
- numpy_array_state = data_table.as_matrix(columns=[col])
- numpy_array_state = numpy_array_state.astype(np.float32)
- numpy_matrix_state_with_mask = np.ma.masked_invalid(numpy_array_state)
-
- # Find the best other parameters based on the data (e.g. Q)
- kf = kf.em(numpy_matrix_state_with_mask, n_iter=5)
-
- # And apply the filter.
- (new_data, filtered_state_covariances) = kf.filter(numpy_matrix_state_with_mask)
-
- data_table[col + '_kalman'] = new_data
- return data_table
diff --git a/PythonCode/Chapter3/OutlierDetection.py b/PythonCode/Chapter3/OutlierDetection.py
deleted file mode 100644
index 77ad49ab..00000000
--- a/PythonCode/Chapter3/OutlierDetection.py
+++ /dev/null
@@ -1,165 +0,0 @@
-##############################################################
-# #
-# Mark Hoogendoorn and Burkhardt Funk (2017) #
-# Machine Learning for the Quantified Self #
-# Springer #
-# Chapter 3 #
-# #
-##############################################################
-
-import scipy
-import math
-from sklearn import mixture
-import numpy as np
-import pandas as pd
-import util.util as util
-import copy
-
-# Class for outlier detection algorithms based on some distribution of the data. They
-# all consider only single points per row (i.e. one column).
-class DistributionBasedOutlierDetection:
-
- # Finds outliers in the specified column of datatable and adds a binary column with
- # the same name extended with '_outlier' that expresses the result per data point.
- def chauvenet(self, data_table, col):
- # Taken partly from: https://www.astro.rug.nl/software/kapteyn/
-
- # Computer the mean and standard deviation.
- mean = data_table[col].mean()
- std = data_table[col].std()
- N = len(data_table.index)
- criterion = 1.0/(2*N)
-
- # Consider the deviation for the data points.
- deviation = abs(data_table[col] - mean)/std
-
- # Express the upper and lower bounds.
- low = -deviation/math.sqrt(2)
- high = deviation/math.sqrt(2)
- prob = []
- mask = []
-
- # Pass all rows in the dataset.
- for i in range(0, len(data_table.index)):
- # Determine the probability of observing the point
- prob.append(1.0 - 0.5 * (scipy.special.erf(high[i]) - scipy.special.erf(low[i])))
- # And mark as an outlier when the probability is below our criterion.
- mask.append(prob[i] < criterion)
- data_table[col + '_outlier'] = mask
- return data_table
-
- # Fits a mixture model towards the data expressed in col and adds a column with the probability
- # of observing the value given the mixture model.
- def mixture_model(self, data_table, col):
- # Fit a mixture model to our data.
- data = data_table[data_table[col].notnull()][col]
- g = mixture.GMM(n_components=3, n_iter=1)
-
- g.fit(data.reshape(-1,1))
-
- # Predict the probabilities
- probs = g.score(data.reshape(-1,1))
-
- # Create the right data frame and concatenate the two.
- data_probs = pd.DataFrame(np.power(10, probs), index=data.index, columns=[col+'_mixture'])
- data_table = pd.concat([data_table, data_probs], axis=1)
-
- return data_table
-
-# Class for distance based outlier detection.
-class DistanceBasedOutlierDetection:
-
-
- # Create distance table between rows in the data table. Here, only cols are considered and the specified
- # distance function is used to compute the distance.
- def distance_table(self, data_table, cols, d_function):
- return pd.DataFrame(scipy.spatial.distance.squareform(util.distance(data_table.ix[:, cols], d_function)), columns=data_table.index, index=data_table.index)
-
- # The most simple distance based algorithm. We assume a distance function, e.g. 'euclidean'
- # and a minimum distance of neighboring points and frequency of occurrence.
- def simple_distance_based(self, data_table, cols, d_function, dmin, fmin):
- # Normalize the dataset first.
- new_data_table = util.normalize_dataset(data_table.dropna(axis=0, subset=cols), cols)
- # Create the distance table first between all instances:
- distances = self.distance_table(new_data_table, cols, d_function)
-
- mask = []
- # Pass the rows in our table.
- for i in range(0, len(new_data_table.index)):
- # Check what faction of neighbors are beyond dmin.
- frac = (float(sum([1 for col_val in distances.ix[i,:].tolist() if col_val > dmin]))/len(new_data_table.index))
- # Mark as an outlier if beyond the minimum frequency.
- mask.append(frac > fmin)
- data_mask = pd.DataFrame(mask, index=new_data_table.index, columns=['simple_dist_outlier'])
- data_table = pd.concat([data_table, data_mask], axis=1)
- return data_table
-
- # Computes the local outlier factor. K is the number of neighboring points considered, d_function
- # the distance function again (e.g. 'euclidean').
- def local_outlier_factor(self, data_table, cols, d_function, k):
- # Inspired on https://github.com/damjankuznar/pylof/blob/master/lof.py
- # but tailored towards the distance metrics and data structures used here.
-
- # Normalize the dataset first.
- new_data_table = util.normalize_dataset(data_table.dropna(axis=0, subset=cols), cols)
- # Create the distance table first between all instances:
- self.distances = self.distance_table(new_data_table, cols, d_function)
-
- outlier_factor = []
- # Compute the outlier score per row.
- for i in range(0, len(new_data_table.index)):
- print i
- outlier_factor.append(self.local_outlier_factor_instance(i, k))
- data_outlier_probs = pd.DataFrame(outlier_factor, index=new_data_table.index, columns=['lof'])
- data_table = pd.concat([data_table, data_outlier_probs], axis=1)
- return data_table
-
- # The distance between a row i1 and i2.
- def reachability_distance(self, k, i1, i2):
- # Compute the k-distance of i2.
- k_distance_value, neighbors = self.k_distance(i2, k)
- # The value is the max of the k-distance of i2 and the real distance.
- return max([k_distance_value, self.distances.ix[i1,i2]])
-
- # Compute the local reachability density for a row i, given a k-distance and set of neighbors.
- def local_reachability_density(self, i, k, k_distance_i, neighbors_i):
- # Set distances to neighbors to 0.
- reachability_distances_array = [0]*len(neighbors_i)
-
- # Compute the reachability distance between i and all neighbors.
- for i, neighbor in enumerate(neighbors_i):
- reachability_distances_array[i] = self.reachability_distance(k, i, neighbor)
- if not any(reachability_distances_array):
- return float("inf")
- else:
- # Return the number of neighbors divided by the sum of the reachability distances.
- return len(neighbors_i) / sum(reachability_distances_array)
-
- # Compute the k-distance of a row i, namely the maximum distance within the k nearest neighbors
- # and return a tuple containing this value and the neighbors within this distance.
- def k_distance(self, i, k):
- # Simply look up the values in the distance table, select the min_pts^th lowest value and take the value pairs
- # Take min_pts + 1 as we also have the instance itself in there.
- neighbors = np.argpartition(np.array(self.distances.ix[i,:]), k+1)[0:(k+1)].tolist()
- if i in neighbors:
- neighbors.remove(i)
- return max(self.distances.ix[i,neighbors]), neighbors
-
- # Compute the local outlier score of our row i given a setting for k.
- def local_outlier_factor_instance(self, i, k):
- # Compute the k-distance for i.
- k_distance_value, neighbors = self.k_distance(i, k)
- # Computer the local reachability given the found k-distance and neighbors.
- instance_lrd = self.local_reachability_density(i, k, k_distance_value, neighbors)
- lrd_ratios_array = [0]* len(neighbors)
-
- # Computer the k-distances and local reachability density of the neighbors
- for i, neighbor in enumerate(neighbors):
- k_distance_value_neighbor, neighbors_neighbor = self.k_distance(neighbor, k)
- neighbor_lrd = self.local_reachability_density(neighbor, k, k_distance_value_neighbor, neighbors_neighbor)
- # Store the ratio between the neighbor and the row i.
- lrd_ratios_array[i] = neighbor_lrd / instance_lrd
-
- # Return the average ratio.
- return sum(lrd_ratios_array) / len(neighbors)
-
diff --git a/PythonCode/Chapter3/__init__.py b/PythonCode/Chapter3/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/PythonCode/Chapter4/FrequencyAbstraction.py b/PythonCode/Chapter4/FrequencyAbstraction.py
deleted file mode 100644
index d86ad66f..00000000
--- a/PythonCode/Chapter4/FrequencyAbstraction.py
+++ /dev/null
@@ -1,55 +0,0 @@
-##############################################################
-# #
-# Mark Hoogendoorn and Burkhardt Funk (2017) #
-# Machine Learning for the Quantified Self #
-# Springer #
-# Chapter 4 #
-# #
-##############################################################
-
-import numpy as np
-
-
-# This class performs a Fourier transformation on the data to find frequencies that occur
-# often and filter noise.
-class FourierTransformation:
-
- # Find the amplitudes of the different frequencies using a fast fourier transformation. Here,
- # the sampling rate expresses
- # the number of samples per second (i.e. Frequency is Hertz of the dataset).
- def find_fft_transformation(self, data, sampling_rate):
- # Create the transformation, this includes the amplitudes of both the real
- # and imaginary part.
- transformation = np.fft.rfft(data, len(data))
- return transformation.real, transformation.imag
-
- # Get frequencies over a certain window.
- def abstract_frequency(self, data_table, cols, window_size, sampling_rate):
-
- # Create new columns for the frequency data.
- freqs = np.fft.rfftfreq(int(window_size)) *sampling_rate
-
- for col in cols:
- data_table[col + '_max_freq'] = np.nan
- data_table[col + '_freq_weighted'] = np.nan
- data_table[col + '_pse'] = np.nan
- for freq in freqs:
- data_table[col + '_freq_' + str(freq) + '_Hz_ws_' + str(window_size)] = np.nan
-
- # Pass over the dataset (we cannot compute it when we do not have enough history)
- # and compute the values.
- for i in range(window_size, len(data_table.index)):
- for col in cols:
- real_ampl, imag_ampl = self.find_fft_transformation(data_table[col][i-window_size:min(i+1, len(data_table.index))], sampling_rate)
- # We only look at the real part in this implementation.
- for j in range(0, len(freqs)):
- data_table.ix[i, col + '_freq_' + str(freqs[j]) + '_Hz_ws_' + str(window_size)] = real_ampl[j]
- # And select the dominant frequency. We only consider the positive frequencies for now.
-
- data_table.ix[i, col + '_max_freq'] = freqs[np.argmax(real_ampl[0:len(real_ampl)])]
- data_table.ix[i, col + '_freq_weighted'] = float(np.sum(freqs * real_ampl)) / np.sum(real_ampl)
- PSD = np.divide(np.square(real_ampl),float(len(real_ampl)))
- PSD_pdf = np.divide(PSD, np.sum(PSD))
- data_table.ix[i, col + '_pse'] = -np.sum(np.log(PSD_pdf) * PSD_pdf)
-
- return data_table
diff --git a/PythonCode/Chapter4/TemporalAbstraction.py b/PythonCode/Chapter4/TemporalAbstraction.py
deleted file mode 100644
index 0b9803ba..00000000
--- a/PythonCode/Chapter4/TemporalAbstraction.py
+++ /dev/null
@@ -1,194 +0,0 @@
-##############################################################
-# #
-# Mark Hoogendoorn and Burkhardt Funk (2017) #
-# Machine Learning for the Quantified Self #
-# Springer #
-# Chapter 4 #
-# #
-##############################################################
-
-import numpy as np
-import scipy.stats as stats
-
-# Class to abstract a history of numerical values we can use as an attribute.
-class NumericalAbstraction:
-
-
- # This function aggregates a list of values using the specified aggregation
- # function (which can be 'mean', 'max', 'min', 'median', 'std', 'slope')
- def aggregate_value(self, data, aggregation_function):
- # Compute the values and return the result.
- if aggregation_function == 'mean':
- return data.mean(skipna = True)
- elif aggregation_function == 'max':
- return data.max(skipna = True)
- elif aggregation_function == 'min':
- return data.min(skipna = True)
- elif aggregation_function == 'median':
- return data.median(skipna = True)
- elif aggregation_function == 'std':
- return data.std(skipna = True)
- elif aggregation_function == 'slope':
- # For the slope we need a bit more work.
- # We create time points, assuming discrete time steps with fixed delta t:
- times = np.array(range(0, len(data.index)))
- data = data.as_matrix().astype(np.float32)
-
- # Check for NaN's
- mask = ~np.isnan(data)
-
- # If we have no data but NaN we return NaN.
- if (len(data[mask]) == 0):
- return np.nan
- # Otherwise we return the slope.
- else:
- slope, intercept, r_value, p_value, std_err = stats.linregress(times[mask], data[mask])
- return slope
- else:
- return np.nan
-
- # Abstract numerical columns specified given a window size (i.e. the number of time points from
- # the past considered) and an aggregation function.
- def abstract_numerical(self, data_table, cols, window_size, aggregation_function):
-
- # Create new columns for the temporal data.
- for col in cols:
- data_table[col + '_temp_' + aggregation_function + '_ws_' + str(window_size)] = np.nan
-
- # Pass over the dataset (we cannot compute it when we do not have enough history)
- # and compute the values.
- for i in range(window_size, len(data_table.index)):
- for col in cols:
- data_table.ix[i, col + '_temp_' + aggregation_function + '_ws_' +str(window_size)] = self.aggregate_value(data_table[col][i-window_size:min(i+1, len(data_table.index))], aggregation_function)
-
- return data_table
-
-# Class to perform categorical abstraction. We obtain patterns of categorical attributes that occur frequently
-# over time.
-class CategoricalAbstraction:
-
- pattern_prefix = 'temp_pattern_'
- before = '(b)'
- co_occurs = '(c)'
- cache = {}
-
- # Determine the time points a pattern occurs in the dataset given a windows size.
- def determine_pattern_times(self, data_table, pattern, window_size):
- times = []
-
- # If we have a pattern of length one
- if len(pattern) == 1:
- # If it is in the cache, we get the times from the cache.
- if self.cache.has_key(self.to_string(pattern)):
- times = self.cache[self.to_string(pattern)]
- # Otherwise we identify the time points at which we observe the value.
- else:
- timestamp_rows = data_table[data_table[pattern[0]] > 0].index.values.tolist()
- times = [data_table.index.get_loc(i) for i in timestamp_rows]
- self.cache[self.to_string(pattern)] = times
-
- # If we have a complex pattern ( (b) or (c) )
- elif len(pattern) == 3:
- # We computer the time points of and
- time_points_first_part = self.determine_pattern_times(data_table, pattern[0], window_size)
- time_points_second_part = self.determine_pattern_times(data_table, pattern[2], window_size)
-
- # If it co-occurs we take the intersection.
- if pattern[1] == self.co_occurs:
- # No use for co-occurences of the same patterns...
- if pattern[0] == pattern[2]:
- times = []
- else:
- times = list(set(time_points_first_part) & set(time_points_second_part))
- # Otherwise we take all time points from at which we observed within the given
- # window size.
- elif pattern[1] == self.before:
- for t in time_points_second_part:
- if len([i for i in time_points_first_part if ((i >= t - window_size) & (i < t))]):
- times.append(t)
- return times
-
- # Create a string representation of a pattern.
- def to_string(self, pattern):
- # If we just have one component, return the string.
- if len(pattern) == 1:
- return str(pattern[0])
- # Otherwise, return the merger of the strings of all
- # components.
- else:
- name = ''
- for p in pattern:
- name = name + self.to_string(p)
- return name
-
- # Selects the patterns from 'patterns' that meet the minimum support in the dataset
- # given the window size.
- def select_k_patterns(self, data_table, patterns, min_support, window_size):
- selected_patterns = []
- for pattern in patterns:
- # Determine the times at which the pattern occurs.
- times = self.determine_pattern_times(data_table, pattern, window_size)
- # Compute the support
- support = float(len(times))/len(data_table.index)
- # If we meet the minum support, append the selected patterns and set the
- # value to 1 at which it occurs.
- if support >= min_support:
- selected_patterns.append(pattern)
- print self.to_string(pattern)
- # Set the occurrence of the pattern in the row to 0.
- data_table[self.pattern_prefix + self.to_string(pattern)] = 0
- data_table.ix[times, self.pattern_prefix + self.to_string(pattern)] = 1
- return data_table, selected_patterns
-
-
- # extends a set of k-patterns with the 1-patterns that have sufficient support.
- def extend_k_patterns(self, k_patterns, one_patterns):
- new_patterns = []
- for k_p in k_patterns:
- for one_p in one_patterns:
- # Add a before relationship
- new_patterns.append([k_p, self.before, one_p])
- # Add a co-occurs relationship.
- new_patterns.append([k_p, self.co_occurs, one_p])
- return new_patterns
-
-
- # Function to abstract our categorical data. Note that we assume a list of binary columns representing
- # the different categories. We set whether the column names should match exactly 'exact' or should include the
- # specified name 'like'. We also express a minimum support,a windows size between succeeding patterns and a
- # maximum size for the number of patterns.
- def abstract_categorical(self, data_table, cols, match, min_support, window_size, max_pattern_size):
-
- # Find all the relevant columns of binary attributes.
- col_names = list(data_table.columns)
- selected_patterns = []
-
- relevant_dataset_cols = []
- for i in range(0, len(cols)):
- if match[i] == 'exact':
- relevant_dataset_cols.append(cols[i])
- else:
- relevant_dataset_cols.extend([name for name in col_names if cols[i] in name])
-
- # Generate the one patterns first
- potential_1_patterns = [[pattern] for pattern in relevant_dataset_cols]
-
- new_data_table, one_patterns = self.select_k_patterns(data_table, potential_1_patterns, min_support, window_size)
- selected_patterns.extend(one_patterns)
- print 'Number of patterns of size 1 is ' + str(len(one_patterns))
-
- k = 1
- k_patterns = one_patterns
-
- # And generate all following patterns.
- while (k < max_pattern_size) & (len(k_patterns) > 0):
- k = k + 1
- potential_k_patterns = self.extend_k_patterns(k_patterns, one_patterns)
- new_data_table, selected_new_k_patterns = self.select_k_patterns(new_data_table, potential_k_patterns, min_support, window_size)
- selected_patterns.extend(selected_new_k_patterns)
- print 'Number of patterns of size ' + str(k) + ' is ' + str(len(selected_new_k_patterns))
-
- return new_data_table
-
-
-
diff --git a/PythonCode/Chapter4/TextAbstraction.py b/PythonCode/Chapter4/TextAbstraction.py
deleted file mode 100644
index 6202c65b..00000000
--- a/PythonCode/Chapter4/TextAbstraction.py
+++ /dev/null
@@ -1,197 +0,0 @@
-##############################################################
-# #
-# Mark Hoogendoorn and Burkhardt Funk (2017) #
-# Machine Learning for the Quantified Self #
-# Springer #
-# Chapter 4 #
-# #
-##############################################################
-
-from nltk import tokenize
-from nltk import parse
-from nltk.stem.snowball import SnowballStemmer
-import unicodedata
-import unidecode
-from nltk.corpus import stopwords
-import nltk
-import math
-import gensim
-import gensim.models.ldamodel as lda
-
-# This class includes a number of approaches that abstract text based data to structured features.
-class TextAbstraction:
-
- col_name = 'words'
- bow = 'bow'
-
- # Tokenize the text: identify sentences and words within sentences. Returns a list of words.
- def tokenization(self, text):
- words = []
- sentences = tokenize.sent_tokenize(text)
- for sentence in sentences:
- words.extend(tokenize.word_tokenize(sentence))
- words = list(filter(lambda x: x != '.', words))
- return words
-
- # Create a clean set of words which are lower case and do not include any undesired characters.
- # Returns the cleaned set.
- def lower_case_and_filter_chars(self, words):
- new_words = []
- for word in words:
- # Take the lower case.
- word = word.lower()
- try:
- # Use the proper coding.
- word = word.decode('utf-8')
- except:
- word = word
- # something went wrong with the decoding, don't care for now.
-
- word = unidecode.unidecode(word)
- newText = ''
-
- # Select only the letters from the alphabet.
- for c in word:
- if ((c >= 'a' and c <= 'z') or (c >= 'A' and c <= 'Z') or c == ' '):
- newText = newText + c
- if len(newText) > 0:
- new_words.append(newText)
- return new_words
-
-
- # Stem a list of words. Return the list of stemmed words.
- def stem(self, text):
- stemmer = SnowballStemmer("english")
- newText = []
- for w in text:
- newText.append(str(stemmer.stem(w)))
- return newText
-
- # Remove stopwords from a list of words. Returns the cleaned list.
- def remove_stop_words(self, text):
- stopwordList = stopwords.words('english')
- names = nltk.corpus.names.words()
-
- newText = []
- for w in text:
- if w.lower() not in stopwordList and w.lower() not in names:
- newText.append(w)
- return newText
-
- # Create combinations of words for n-grams. Return a list of elements
- # that are the combination of words that occur adjacent.
- def form_n_grams(self, words, n):
- n_grams = []
- for i in range(0, len(words)-n):
- n_grams.append('_'.join(words[i:i+n]))
- return n_grams
-
- # Identify words in the specified columns, create n-grams. Returns the same
- # data table with a column with text where each row is a list of cleaned n-grams
- # that occur in the cols of that row and the list of unique words is returned.
- def identify_words(self, data_table, cols, n):
- word_attributes = []
-
- # Create the cleaned text column.
- data_table[self.col_name] = 0
- data_table[self.col_name] = data_table[self.col_name].astype(object)
-
- # Pass all rows.
- for i in range(0, len(data_table.index)):
- words = []
- text = ''
- for col in cols:
- text = text + ' ' + data_table.ix[i, col]
-
- # Perform the NLP pipeline.
- words = self.tokenization(text)
- lower_case_words = self.lower_case_and_filter_chars(words)
- stemmed_words = self.stem(lower_case_words)
- no_stopwords_words = self.remove_stop_words(stemmed_words)
- n_grams = self.form_n_grams(no_stopwords_words, n)
- current_set = set(word_attributes)
- new_set = set(n_grams)
- # Store the current set of n-grams found.
- word_attributes = list(current_set | new_set)
- # And add the found list of words to the table.
- data_table.set_value(i, self.col_name, n_grams)
-
- return data_table, word_attributes
-
- # Apply the bag of words approach upon the text that can be found in cols. It identifies
- # n-grams and creates columns for those n-grams. It computes the number of occurrences
- # of the n-grams per row and uses that as a value.
- def bag_of_words(self, data_table, cols, n):
-
- # Identify the words and clean the table.
- data_table, words = self.identify_words(data_table, cols, n)
-
- # Create columns for each word.
- for word in words:
- data_table[cols[0] + '_bow_' + word] = 0
-
- # And count the occurrences per row.
- for i in range(0, len(data_table.index)):
- data_table.ix[i, cols[0] + '_bow_' + word] = data_table.ix[i, self.col_name].count(word)
-
- # Remove the temporary column we had created for the cleaned lists of words.
- del data_table[self.col_name]
- return data_table
-
- # Apply the bag of words approach upon the text that can be found in cols. It identifies
- # n-grams and creates columns for those n-grams. It computes the TF-IDF
- # of the n-grams per row and uses that as a value.
- def tf_idf(self, data_table, cols, n):
-
- # Identify the words and clean the table.
- data_table, words = self.identify_words(data_table, cols, n)
-
- # Create columns for each word.
- for word in words:
- data_table[cols[0] + '_tf_idf_' + word] = 0.0
-
- for i in range(0, len(data_table.index)):
-
- # And count the tf score.
- tf = data_table.ix[i, self.col_name].count(word)
- data_table.ix[i, cols[0] + '_tf_idf_' + word] = tf
-
- # Compute the idf score over all rows.
- idf = math.log(float(len(data_table.index))/len(data_table.loc[data_table[cols[0] + '_tf_idf_' + word] > 0].index))
- # and multiply the rows with the idf.
- data_table[cols[0] + '_tf_idf_' + word] = data_table[cols[0] + '_tf_idf_' + word].mul(idf)
- # Remove the temporary column we had created for the cleaned lists of words.
- del data_table[self.col_name]
- return data_table
-
- # This function identifies n topics in the data using LDA and for each row computes a score for the
- # topic. It returns a dataset with columns added for the topics containing the scores per row.
- def topic_modeling(self, data_table, cols, n_topics):
-
- # Identify the words and clean the table.
- data_table, words = self.identify_words(data_table, cols)
-
- # Create a dictionary based on the words we have identified per row.
- dict_topics = gensim.corpora.Dictionary(data_table[self.col_name])
- # Create a corpus containing all words.
- corpus = [dict_topics.doc2bow([word]) for word in words]
-
- # Apply LDA.
- model = lda.LdaModel(corpus, id2word=dict_topics, num_topics=n_topics)
-
- # Get the topics we found.
- topics = model.show_topics(num_topics=n_topics, num_words=10, log=False, formatted=False)
-
- # Create columns for the topics.
- for topic in range(0, n_topics):
- data_table[cols[0] + '_topic_' + str(topic)] = 0.0
-
- # Score the topics per row and set the values accordingly.
- for i in range(0, len(data_table.index)):
- topic_scores = model[dict_topics.doc2bow(data_table.ix[i, self.col_name])]
- for score in topic_scores:
- data_table.ix[i, cols[0] + '_topic_' + str(score[0])] = score[1]
- # Remove the temporary column we had created for the cleaned lists of words.
- del data_table[self.col_name]
- return data_table
-
diff --git a/PythonCode/Chapter4/__init__.py b/PythonCode/Chapter4/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/PythonCode/Chapter5/Clustering.py b/PythonCode/Chapter5/Clustering.py
deleted file mode 100644
index efddf114..00000000
--- a/PythonCode/Chapter5/Clustering.py
+++ /dev/null
@@ -1,329 +0,0 @@
-##############################################################
-# #
-# Mark Hoogendoorn and Burkhardt Funk (2017) #
-# Machine Learning for the Quantified Self #
-# Springer #
-# Chapter 5 #
-# #
-##############################################################
-
-from sklearn.cluster import KMeans
-from Chapter5.DistanceMetrics import InstanceDistanceMetrics
-import sklearn
-import pandas as pd
-import numpy as np
-from sklearn.metrics import silhouette_samples, silhouette_score
-from Chapter5.DistanceMetrics import PersonDistanceMetricsNoOrdering
-from Chapter5.DistanceMetrics import PersonDistanceMetricsOrdering
-import random
-import scipy
-from scipy.cluster.hierarchy import linkage, fcluster
-from sklearn.neighbors import DistanceMetric
-import pyclust
-
-# Implementation of the non hierarchical clustering approaches.
-class NonHierarchicalClustering:
-
- # Global parameters for distance functions
- p = 1
- max_lag = 1
-
- # Identifiers of the various distance and abstraction approaches.
- euclidean = 'euclidean'
- minkowski = 'minkowski'
- manhattan = 'manhattan'
- gower = 'gower'
- abstraction_mean = 'abstraction_mean'
- abstraction_normal = 'abstraction_normal'
- abstraction_p = 'abstraction_p'
- abstraction_euclidean = 'abstract_euclidean'
- abstraction_lag = 'abstract_lag'
- abstraction_dtw = 'abstract_dtw'
-
- # Define the gowers distance between arrays to be used in k-means and k-medoids.
- def gowers_similarity(self, X, Y=None, Y_norm_squared=None, squared=False):
- X = np.matrix(X)
- distances = np.zeros(shape=(X.shape[0], Y.shape[0]))
- DM = InstanceDistanceMetrics()
- # Pairs up the elements in the dataset
- for x_row in range(0, X.shape[0]):
- data_row1 = pd.DataFrame(X[x_row])
- for y_row in range(0, Y.shape[0]):
- data_row2 = pd.DataFrame(Y[y_row]).transpose()
- # And computer the distance as defined in our distance metrics class.
- distances[x_row, y_row] = DM.gowers_similarity(data_row1, data_row2, self.p)
- return np.array(distances)
-
- # Use a predefined distance function for the Minkowski distance
- def minkowski_distance(self, X, Y=None, Y_norm_squared=None, squared=False):
- dist = DistanceMetric.get_metric('minkowski', p=self.p)
- return dist.pairwise(X, Y)
-
- # Use a predefined distance function for the Manhattan distance
- def manhattan_distance(self, X, Y=None, Y_norm_squared=None, squared=False):
- dist = DistanceMetric.get_metric('manhattan')
- return dist.pairwise(X, Y)
-
- # Use a predefined distance function for the Euclidean distance
- def euclidean_distance(self, X, Y=None, Y_norm_squared=None, squared=False):
- dist = DistanceMetric.get_metric('euclidean')
- return dist.pairwise(X, Y)
-
- # If we want to compare dataset between persons one approach is to flatten
- # each dataset to a single record/instance. This is done based on the approaches
- # we have defined in the distance metrics file.
- def aggregate_datasets(self, datasets, cols, abstraction_method):
- temp_datasets = []
- DM = PersonDistanceMetricsNoOrdering()
-
- # Flatten all datasets and add them to the newly formed dataset.
- for i in range(0, len(datasets)):
- temp_dataset = datasets[i][cols]
- temp_datasets.append(temp_dataset)
-
- if abstraction_method == self.abstraction_normal:
- return DM.create_instances_normal_distribution(temp_datasets)
- else:
- return DM.create_instances_mean(temp_datasets)
-
- # Perform k-means over an individual dataset.
- def k_means_over_instances(self, dataset, cols, k, distance_metric, max_iters, n_inits, p=1):
-
- # Take the appropriate columns.
- temp_dataset = dataset[cols]
- # Override the standard distance functions. Store the original first
- sklearn_euclidian_distances = sklearn.cluster.k_means_.euclidean_distances
- if distance_metric == self.euclidean:
- sklearn.cluster.k_means_.euclidean_distances = self.euclidean_distance
- elif distance_metric == self.minkowski:
- self.p = p
- sklearn.cluster.k_means_.euclidean_distances = self.minkowski_distance
- elif distance_metric == self.manhattan:
- sklearn.cluster.k_means_.euclidean_distances = self.manhattan_distance
- elif distance_metric == self.gower:
- self.ranges = []
- for col in temp_dataset.columns:
- self.ranges.append(temp_dataset[col].max() - temp_dataset[col].min())
- sklearn.cluster.k_means_.euclidean_distances = self.gower_similarity
- # If we do not recognize the option we use the default distance function, which is much
- # faster....
- # Now apply the k-means algorithm
- kmeans = KMeans(n_clusters=k, max_iter=max_iters, n_init=n_inits, random_state=0).fit(temp_dataset)
- # Add the labels to the dataset
- dataset['cluster'] = kmeans.labels_
- # Compute the solhouette and add it as well.
- silhouette_avg = silhouette_score(temp_dataset, kmeans.labels_)
- silhouette_per_inst = silhouette_samples(temp_dataset, kmeans.labels_)
- dataset['silhouette'] = silhouette_per_inst
-
- # Reset the module distance function for further usage
- sklearn_euclidian_distances = sklearn_euclidian_distances
-
- return dataset
-
- # We have datasets covering multiple persons. We abstract the datatasets using an approach and create
- # clusters of persons.
- def k_means_over_datasets(self, datasets, cols, k, abstraction_method, distance_metric, max_iters, n_inits, p=1):
- # Convert the datasets to instances
- temp_dataset = self.aggregate_datasets(datasets, cols, abstraction_method)
-
- # And simply apply the instance based algorithm.....
- return self.k_means_over_instances(temp_dataset, temp_dataset.columns, k, distance_metric, max_iters, n_inits, p)
-
- # For our own k-medoids algorithm we use our own implementation. For this we computer a complete distance matrix
- # between points.
- def compute_distance_matrix_instances(self, dataset, distance_metric):
- # If the distance function is not defined in our distance metrics, we use the standard euclidean distance.
- if not (distance_metric in [self.manhattan, self.minkowski, self.gower, self.euclidean]):
- distances = sklearn.metrics.pairwise.euclidean_distances(X=dataset, Y=dataset)
- return pd.DataFrame(distances, index=range(0, len(dataset.index)), columns=range(0, len(dataset.index)))
- # Create an empty pandas dataframe for our distance matrix
- distances = pd.DataFrame(index=range(0, len(dataset.index)), columns=range(0, len(dataset.index)))
- DM = InstanceDistanceMetrics()
-
- # Define the ranges of the columns if we use the gower distance.
- ranges = []
- if distance_metric == self.gower:
- for col in dataset.columns:
- self.ranges.append(dataset[col].max() - dataset[col].min())
-
- # And compute the distances for each pair. Note that we assume the distances to be symmetric.
- for i in range(0, len(dataset.index)):
- for j in range(i, len(dataset.index)):
- if distance_metric == self.manhattan:
- distances.ix[i,j] = self.manhattan_distance(dataset.ix[i:i+1,:], dataset.ix[j:j+1,:])
- elif distance_metric == self.minkowski:
- distances.ix[i,j] = self.manhattan_distance(dataset.ix[i:i+1,:], dataset.ix[j:j+1,:], self.p)
- elif distance_metric == self.gower:
- distances.ix[i,j] = self.gower_distance(dataset.ix[i:i+1,:], dataset.ix[j:j+1,:])
- elif distance_metric == self.euclidean:
- distances.ix[i,j] = self.euclidean_distance(dataset.ix[i:i+1,:], dataset.ix[j:j+1,:])
- distances.ix[j,i] = distances.ix[i,j]
- return distances
-
- # We need to implement k-medoids ourselves to accommodate all distance metrics
- def k_medoids_over_instances(self, dataset, cols, k, distance_metric, max_iters, n_inits=5, p=1):
- # If we set it to default we use the pyclust package...
- temp_dataset = dataset[cols]
- if distance_metric == 'default':
- km = pyclust.KMedoids(n_clusters=k, n_trials=n_inits)
- km.fit(temp_dataset.as_matrix())
- cluster_assignment = km.labels_
-
- else:
- self.p = p
- cluster_assignment = []
- best_silhouette = -1
-
- # Compute all distances
- D = self.compute_distance_matrix_instances(temp_dataset, distance_metric)
-
- for it in range(0, n_inits):
- # First select k random points as centers:
- centers = random.sample(range(0, len(dataset.index)), k)
- prev_centers = []
- points_to_cluster = []
-
- n_iter = 0
- while (n_iter < max_iters) and not (centers == prev_centers):
- n_iter += 1
- prev_centers = centers
- # Assign points to clusters.
- points_to_centroid = D[centers].idxmin(axis=1)
-
- new_centers = []
- for i in range(0, k):
- # And find the new center that minimized the sum of the differences.
- best_center = D.ix[points_to_centroid == centers[i], points_to_centroid == centers[i]].sum().idxmin(axis=1)
- new_centers.append(best_center)
- centers = new_centers
-
- # Convert centroids to cluster numbers:
-
- points_to_centroid = D[centers].idxmin(axis=1)
- current_cluster_assignment = []
- for i in range(0, len(dataset.index)):
- current_cluster_assignment.append(centers.index(points_to_centroid.ix[i,:]))
-
- silhouette_avg = silhouette_score(temp_dataset, np.array(current_cluster_assignment))
- if silhouette_avg > best_silhouette:
- cluster_assignment = current_cluster_assignment
- best_silhouette = silhouette_avg
-
- # And add the clusters and silhouette scores to the dataset.
- dataset['cluster'] = cluster_assignment
- silhouette_avg = silhouette_score(temp_dataset, np.array(cluster_assignment))
- silhouette_per_inst = silhouette_samples(temp_dataset, np.array(cluster_assignment))
- dataset['silhouette'] = silhouette_per_inst
-
- return dataset
-
- # For k-medoids we use all possible distance metrics between datasets as well. For this we
- # again need to define a distance matrix between the datasets.
- def compute_distance_matrix_datasets(self, datasets, distance_metric):
- distances = pd.DataFrame(index=range(0, len(datasets)), columns=range(0, len(datasets)))
- DMNoOrdering = PersonDistanceMetricsNoOrdering()
- DMOrdering = PersonDistanceMetricsOrdering()
-
- # And compute the distances for each pair. Note that we assume the distances to be symmetric.
- for i in range(0, len(datasets)):
- for j in range(i, len(datasets)):
- if distance_metric == self.abstraction_p:
- distances.ix[i,j] = DMNoOrdering.p_distance(datasets[i], datasets[j])
- elif distance_metric == self.abstraction_euclidean:
- distances.ix[i,j] = DMOrdering.euclidean_distance(datasets[i], datasets[j])
- elif distance_metric == self.abstraction_lag:
- distances.ix[i,j] = DMOrdering.lag_correlation(datasets[i], datasets[j], self.max_lag)
- elif distance_metric == self.abstraction_dtw:
- distances.ix[i,j] = DMOrdering.dynamic_time_warping(datasets[i], datasets[j])
- distances.ix[j,i] = distances.ix[i,j]
- return distances
-
- # Note: distance metric only important in combination with certain abstraction methods as we allow for more
- # in k-medoids.
- def k_medoids_over_datasets(self, datasets, cols, k, abstraction_method, distance_metric, max_iters, n_inits=5, p=1, max_lag=5):
- self.p = p
- self.max_lag = max_lag
-
- # If we compare datasets by flattening them, we can simply flatten the dataset and apply the instance based
- # variant.
- if abstraction_method in [self.abstraction_mean, self.abstraction_normal]:
- # Convert the datasets to instances
- temp_dataset = self.aggregate_datasets(datasets, cols, abstraction_method)
-
- # And simply apply the instance based algorithm in case of
- return self.k_medoids_over_instances(temp_dataset, temp_dataset.columns, k, distance_metric, max_iters, n_inits=n_inits, p=p)
-
- # For the case over datasets we do not have a quality metric, therefore we just look at a single initialization for now (!)
-
- # First select k random points as centers:
- centers = random.sample(range(0, len(datasets)), k)
- prev_centers = []
- points_to_cluster = []
- # Compute all distances
- D = self.compute_distance_matrix_datasets(datasets, abstraction_method)
-
- n_iter = 0
- while (n_iter < max_iters) and not (centers == prev_centers):
- n_iter += 1
- prev_centers = centers
- # Assign points to clusters.
- points_to_centroid = D[centers].idxmin(axis=1)
-
- new_centers = []
- for i in range(0, k):
- # And find the new center that minimized the sum of the differences.
- best_center = D.ix[points_to_centroid == centers[i], points_to_centroid == centers[i]].sum().idxmin(axis=1)
- new_centers.append(best_center)
- centers = new_centers
-
- # Convert centroids to cluster numbers:
-
- points_to_centroid = D[centers].idxmin(axis=1)
- cluster_assignment = []
- for i in range(0, len(datasets)):
- cluster_assignment.append(centers.index(points_to_centroid.ix[i,:]))
-
- dataset = pd.DataFrame(index=range(0, len(datasets)))
- dataset['cluster'] = cluster_assignment
-
- # Silhouette cannot be used here as it used a distance between instances, not datasets.
-
- return dataset
-
-# In this class, we do not implement the Gover distance between instance, all others are included.
-# Furthermore, we only implement the agglomerative approach.
-class HierarchicalClustering:
-
- link = None
-
- # Perform agglomerative clustering over a single dataset.
- def agglomerative_over_instances(self, dataset, cols, max_clusters, distance_metric, use_prev_linkage=False, link_function='single'):
- temp_dataset = dataset[cols]
- df = NonHierarchicalClustering()
-
- if (not use_prev_linkage) or (self.link is None):
- # Perform the clustering process according to the specified distance metric.
- if distance_metric == df.manhattan:
- self.link = linkage(temp_dataset.as_matrix(), method=link_function, metric='cityblock')
- else:
- self.link = linkage(temp_dataset.as_matrix(), method=link_function, metric='euclidean')
-
- # And assign the clusters given the set maximum. In addition, compute the
- cluster_assignment = fcluster(self.link, max_clusters, criterion='maxclust')
- dataset['cluster'] = cluster_assignment
- silhouette_avg = silhouette_score(temp_dataset, np.array(cluster_assignment))
- silhouette_per_inst = silhouette_samples(temp_dataset, np.array(cluster_assignment))
- dataset['silhouette'] = silhouette_per_inst
-
- return dataset, self.link
-
- # Perform agglomerative clustering over the datasets by flattening them into a single dataset.
- def agglomerative_over_datasets(self, datasets, cols, max_clusters, abstraction_method, distance_metric, use_prev_linkage=False, link_function='single'):
- # Convert the datasets to instances
- df = NonHierarchicalClustering()
- temp_dataset = df.aggregate_datasets(datasets, cols, abstraction_method)
-
- # And simply apply the instance based algorithm...
- return self.agglomerative_over_instances(temp_dataset, temp_dataset.columns, max_clusters, distance_metric, use_prev_linkage=use_prev_linkage, link_function=link_function)
-
diff --git a/PythonCode/Chapter5/DistanceMetrics.py b/PythonCode/Chapter5/DistanceMetrics.py
deleted file mode 100644
index 7c432bde..00000000
--- a/PythonCode/Chapter5/DistanceMetrics.py
+++ /dev/null
@@ -1,188 +0,0 @@
-##############################################################
-# #
-# Mark Hoogendoorn and Burkhardt Funk (2017) #
-# Machine Learning for the Quantified Self #
-# Springer #
-# Chapter 5 #
-# #
-##############################################################
-
-import math
-import numbers
-import numpy as np
-import pandas as pd
-from scipy.stats import norm
-from scipy import stats
-import sys
-from sklearn.neighbors import DistanceMetric
-import sklearn
-
-
-
-# Class defining the distance metrics that are not available as standard ones....
-class InstanceDistanceMetrics:
-
- # S for gowers distance
- def s(self, val1, val2, range):
- # If we compare numbers we look at the difference and normalize.
- if isinstance(val1, numbers.Number) and isinstance(val1, numbers.Number):
- return 1 - (float(abs(val1-val2))/range)
- # If we compare something else, we just look at whether they are equal.
- else:
- if val1 == val2:
- return 1
- else:
- return 0
-
- # Delta for gowers distance.
- def delta(self, val1, val2):
- # Check whether both values are known (i.e. nan), if so the delta is 1, 0 otherwise.
- if (not np.isnan(val1)) and (not np.isnan(val2)):
- return 1
- return 0
-
- # Define gowers distance between two rows, given the ranges of the variables
- # over the entire dataset (over all columns in row1 and row2)
- def gowers_similarity(self, data_row1, data_row2, ranges):
- # We cannot computer if the lengths are not equal.
- if len(data_row1.columns) != len(data_row2.columns):
- return -1
-
- delta_total = 0
- s_total = 0
-
- # iterate over all columns.
- for i in range(0, len(data_row1.columns)):
- val1 = data_row1[data_row1.columns[i]].values[0]
- val2 = data_row2[data_row2.columns[i]].values[0]
- # compute the delta
- delta = self.delta(val1, val2)
- delta_total = delta_total + delta
- if delta > 0:
- # and compute the s if the delta is above 0.
- s_total = s_total + self.s(val1, val2, ranges[i])
- return float(s_total)/delta_total
-
-# Class to flatten datasets or compute the statistical difference between cases.
-class PersonDistanceMetricsNoOrdering:
-
- gower = 'gower'
- minkowski = 'minkowski'
-
- # This returns a dataset with aggregated data instances based on the mean values
- # in the rows.
- def create_instances_mean(self, datasets):
- index = range(0, len(datasets))
- cols = datasets[0].columns
- new_dataset = pd.DataFrame(index=index, columns=cols)
-
- for i in range(0, len(datasets)):
- for col in cols:
- # Compute the mean per column and assign that
- # value for the row representing the current
- # dataset.
- new_dataset.ix[i, col] = datasets[i][col].mean()
-
- return new_dataset
-
- # Fit datasets to normal distribution and use parameters as instances
- def create_instances_normal_distribution(self, datasets):
- index = range(0, len(datasets))
- cols = datasets[0].columns
- new_cols = []
- # Create new columns for the parameters of the distribution.
- for col in cols:
- new_cols.append(col + '_mu')
- new_cols.append(col + '_sigma')
- new_dataset = pd.DataFrame(index=index, columns=new_cols)
-
- for i in range(0, len(datasets)):
- for col in cols:
- # Fit the distribution and assign the values to the
- # row representing the dataset.
- mu, sigma = norm.fit(datasets[i][col])
- new_dataset.ix[i, col + '_mu'] = mu
- new_dataset.ix[i, col + '_sigma'] = sigma
-
- return new_dataset
-
- # This defines the distance between datasets based on the statistical
- # differences between the distribution we can only compute
- # distances pairwise.
- def p_distance(self, dataset1, dataset2):
-
- cols = dataset1.columns
- distance = 0
- for col in cols:
- D, p_value = stats.ks_2samp(dataset1[col], dataset2[col])
- distance= distance + (1-p_value)
- return distance
-
-# Class to compare two time ordered datasets.
-class PersonDistanceMetricsOrdering:
-
- extreme_value = sys.float_info.max
- tiny_value = 0.000001
-
- # Directly pair up the datasets and computer the euclidean
- # distances between the sequences of values.
- def euclidean_distance(self, dataset1, dataset2):
- dist = DistanceMetric.get_metric('euclidean')
- if not len(dataset1.index) == len(dataset2.index):
- return -1
- distance = 0
-
- for i in range(0, len(dataset1.index)):
- data_row1 = dataset1.iloc[:,i:i+1].transpose()
- data_row2 = dataset2.iloc[:,i:i+1].transpose()
- ecl_dist = dist.pairwise(data_row1, data_row2)
- distance = distance + ecl_dist
-
- return distance
-
- # Compute the distance between two datasets given a set lag.
- def lag_correlation_given_lag(self, dataset1, dataset2, lag):
- distance = 0
- for i in range(0, len(dataset1.columns)):
- # consider the lengths of the series, and compare the
- # number of points in the smallest series.
- length_ds1 = len(dataset1.index)
- length_ds2 = len(dataset2.index) - lag
- length_used = min(length_ds1, length_ds2)
- if length_used < 1:
- return self.extreme_value
- # We multiply the values as expressed in the book.
- ccc = np.multiply(dataset1.ix[0:length_used, i].values, dataset2.ix[lag:length_used+lag, i].values)
- # We add the sum of the mutliplications to the distance. Correct for the difference in length.
- distance = distance + (float(1)/(float(max(ccc.sum(), self.tiny_value))))/length_used
- return distance
-
- # Compute the lag correlation. For this we find the best lag.
- def lag_correlation(self, dataset1, dataset2, max_lag):
- best_dist = -1
- best_lag = 0
- for i in range(0, max_lag+1):
- # Compute the distance given a lag.
- current_dist = self.lag_correlation_given_lag(dataset1, dataset2, i)
- if current_dist < best_dist or best_dist == -1:
- best_dist = current_dist
- best_lag = i
- return best_dist
-
- # Simple implementation of the dtw. Note that we use the euclidean distance here..
- # The implementation follows the algorithm explained in the book very closely.
- def dynamic_time_warping(self, dataset1, dataset2):
- # Create a distance matrix between all time points.
- cheapest_path = np.full((len(dataset1.index), len(dataset2.index)), self.extreme_value)
- cheapest_path[0,0] = 0
- DM = InstanceDistanceMetrics()
-
-
- for i in range(1, len(dataset1.index)):
- for j in range(1, len(dataset2.index)):
- data_row1 = dataset1.iloc[i:i+1,:]
- data_row2 = dataset2.iloc[j:j+1,:]
- d = sklearn.metrics.pairwise.euclidean_distances(data_row1, data_row2)
- cheapest_path[i,j] = d + min(cheapest_path[i-1, j], cheapest_path[i, j-1], cheapest_path[i-1, j-1])
- return cheapest_path[len(dataset1.index)-1, len(dataset2.index)-1]
-
diff --git a/PythonCode/Chapter5/__init__.py b/PythonCode/Chapter5/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/PythonCode/Chapter7/Evaluation.py b/PythonCode/Chapter7/Evaluation.py
deleted file mode 100644
index 791ea3f4..00000000
--- a/PythonCode/Chapter7/Evaluation.py
+++ /dev/null
@@ -1,70 +0,0 @@
-##############################################################
-# #
-# Mark Hoogendoorn and Burkhardt Funk (2017) #
-# Machine Learning for the Quantified Self #
-# Springer #
-# Chapter 7 #
-# #
-##############################################################
-
-from sklearn import metrics
-import pandas as pd
-import numpy as np
-import math
-
-# Class for evaluation metrics of classification problems.
-class ClassificationEvaluation:
-
- # Returns the accuracy given the true and predicted values.
- def accuracy(self, y_true, y_pred):
- return metrics.accuracy_score(y_true, y_pred)
-
- # Returns the precision given the true and predicted values.
- # Note that it returns the precision per class.
- def precision(self, y_true, y_pred):
- return metrics.precision_score(y_true, y_pred, average=None)
-
- # Returns the recall given the true and predicted values.
- # Note that it returns the recall per class.
- def recall(self, y_true, y_pred):
- return metrics.recall_score(y_true, y_pred, average=None)
-
- # Returns the f1 given the true and predicted values.
- # Note that it returns the recall per class.
- def f1(self, y_true, y_pred):
- return metrics.f1_score(y_true, y_pred, average=None)
-
- # Returns the area under the curve given the true and predicted values.
- # Note: we expect a binary classification problem here(!)
- def auc(self, y_true, y_pred_prob):
- return metrics.roc_auc_score(y_true, y_pred_prob)
-
- # Returns the confusion matrix given the true and predicted values.
- def confusion_matrix(self, y_true, y_pred, labels):
- return metrics.confusion_matrix(y_true, y_pred, labels=labels)
-
-# Class for evaluation metrics of regression problems.
-class RegressionEvaluation:
-
- # Returns the mean squared error between the true and predicted values.
- def mean_squared_error(self, y_true, y_pred):
- return metrics.mean_squared_error(y_true, y_pred)
-
- # Returns the mean squared error between the true and predicted values.
- def mean_squared_error_with_std(self, y_true, y_pred):
- y_true = np.array(y_true)
- y_pred = np.array(y_pred)
- errors = np.square(y_true-y_pred)
- mse = errors.mean()
- std = errors.std()
- return mse.mean(), std.mean()
-
- # Returns the mean absolute error between the true and predicted values.
- def mean_absolute_error(self, y_true, y_pred):
- return metrics.mean_absolute_error(y_true, y_pred)
-
- # Return the mean absolute error between the true and predicted values
- # as well as its standard deviation.
- def mean_absolute_error_with_std(self, y_true, y_pred):
- errors = np.absolute((y_pred - y_true))
- return errors.mean(), errors.std()
diff --git a/PythonCode/Chapter7/FeatureSelection.py b/PythonCode/Chapter7/FeatureSelection.py
deleted file mode 100644
index 6e4e56ca..00000000
--- a/PythonCode/Chapter7/FeatureSelection.py
+++ /dev/null
@@ -1,193 +0,0 @@
-##############################################################
-# #
-# Mark Hoogendoorn and Burkhardt Funk (2017) #
-# Machine Learning for the Quantified Self #
-# Springer #
-# Chapter 7 #
-# #
-##############################################################
-
-from Chapter7.LearningAlgorithms import ClassificationAlgorithms
-from Chapter7.Evaluation import ClassificationEvaluation
-from Chapter7.LearningAlgorithms import RegressionAlgorithms
-from Chapter7.Evaluation import RegressionEvaluation
-from scipy.stats import pearsonr
-import sys
-import copy
-import numpy as np
-from operator import itemgetter
-import pandas as pd
-
-# Specifies feature selection approaches for classification to identify the most important features.
-class FeatureSelectionClassification:
-
- # Forward selection for classification which selects a pre-defined number of features (max_features)
- # that show the best accuracy. We assume a decision tree learning for this purpose, but
- # this can easily be changed. It return the best features.
- def forward_selection(self, max_features, X_train, y_train):
- # Start with no features.
- ordered_features = []
- ordered_scores = []
- selected_features = []
- ca = ClassificationAlgorithms()
- ce = ClassificationEvaluation()
- prev_best_perf = 0
-
- # Select the appropriate number of features.
- for i in range(0, max_features):
- print i
-
- #Determine the features left to select.
- features_left = list(set(X_train.columns) - set(selected_features))
- best_perf = 0
- best_attribute = ''
-
- # For all features we can still select...
- for f in features_left:
- temp_selected_features = copy.deepcopy(selected_features)
- temp_selected_features.append(f)
-
- # Determine the accuracy of a decision tree learner if we were to add
- # the feature.
- pred_y_train, pred_y_test, prob_training_y, prob_test_y = ca.decision_tree(X_train[temp_selected_features], y_train, X_train[temp_selected_features])
- perf = ce.accuracy(y_train, pred_y_train)
-
- # If the performance is better than what we have seen so far (we aim for high accuracy)
- # we set the current feature to the best feature and the same for the best performance.
- if perf > best_perf:
- best_perf = perf
- best_feature = f
- # We select the feature with the best performance.
- selected_features.append(best_feature)
- prev_best_perf = best_perf
- ordered_features.append(best_feature)
- ordered_scores.append(best_perf)
- return selected_features, ordered_features, ordered_scores
-
- # Backward selection for classification which selects a pre-defined number of features (max_features)
- # that show the best accuracy. We assume a decision tree learning for this purpose, but
- # this can easily be changed. It return the best features.
- def backward_selection(self, max_features, X_train, y_train):
- # First select all features.
- selected_features = X_train.columns.tolist()
- ca = ClassificationAlgorithms()
- ce = ClassificationEvaluation()
- for i in range(0, (len(X_train.columns) - max_features)):
- best_perf = 0
- worst_feature = ''
-
- # Select from the features that are still in the selection.
- for f in selected_features:
- temp_selected_features = copy.deepcopy(selected_features)
- temp_selected_features.remove(f)
-
- # Determine the score without the feature.
- pred_y_train, pred_y_test, prob_training_y, prob_test_y = ca.decision_tree(X_train[temp_selected_features], y_train, X_train[temp_selected_features])
- perf = ce.accuracy(y_train, pred_y_train)
-
- # If we score better without the feature than what we have seen so far
- # this is the worst feature.
- if perf > best_perf:
- best_perf = perf
- worst_feature = f
-
- # Remove the worst feature.
- selected_features.remove(worst_feature)
- return selected_features
-
-# Specifies feature selection approaches for classification to identify the most important features.
-class FeatureSelectionRegression:
-
- # Forward selection for classification which selects a pre-defined number of features (max_features)
- # that show the best accuracy. We assume a decision tree learning for this purpose, but
- # this can easily be changed. It return the best features.
- def forward_selection(self, max_features, X_train, y_train):
- ordered_features = []
- ordered_scores = []
-
- # Start with no features.
- selected_features = []
- ra = RegressionAlgorithms()
- re = RegressionEvaluation()
- prev_best_perf = sys.float_info.max
-
- # Select the appropriate number of features.
- for i in range(0, max_features):
-
- #Determine the features left to select.
- features_left = list(set(X_train.columns) - set(selected_features))
- best_perf = sys.float_info.max
- best_feature = ''
-
- # For all features we can still select...
- for f in features_left:
- temp_selected_features = copy.deepcopy(selected_features)
- temp_selected_features.append(f)
-
- # Determine the mse of a decision tree learner if we were to add
- # the feature.
- pred_y_train, pred_y_test = ra.decision_tree(X_train[temp_selected_features], y_train, X_train[temp_selected_features])
- perf = re.mean_squared_error(y_train, pred_y_train)
-
- # If the performance is better than what we have seen so far (we aim for low mse)
- # we set the current feature to the best feature and the same for the best performance.
- if perf < best_perf:
- best_perf = perf
- best_feature = f
- # We select the feature with the best performance.
- selected_features.append(best_feature)
- prev_best_perf = best_perf
- ordered_features.append(best_feature)
- ordered_scores.append(best_perf)
- return selected_features, ordered_features, ordered_scores
-
- # Backward selection for classification which selects a pre-defined number of features (max_features)
- # that show the best accuracy. We assume a decision tree learning for this purpose, but
- # this can easily be changed. It return the best features.
- def backward_selection(self, max_features, X_train, y_train):
-
- # First select all features.
- selected_features = X_train.columns.tolist()
- ra = RegressionAlgorithms()
- re = RegressionEvaluation()
-
- # Select from the features that are still in the selection.
- for i in range(0, (len(X_train.columns) - max_features)):
- best_perf = sys.float_info.max
- worst_feature = ''
- for f in selected_features:
- temp_selected_features = copy.deepcopy(selected_features)
- temp_selected_features.remove(f)
-
- # Determine the score without the feature.
- pred_y_train, pred_y_test = ra.decision_tree(X_train[temp_selected_features], y_train, X_train[temp_selected_features])
- perf = re.mean_squared_error(y_train, pred_y_train)
- # If we score better (i.e. a lower mse) without the feature than what we have seen so far
- # this is the worst feature.
- if perf < best_perf:
- best_perf = perf
- worst_feature = f
- # Remove the worst feature.
- selected_features.remove(worst_feature)
- return selected_features
-
- # Select features based upon the correlation through the Pearson coefficient.
- # It return the max_features best features.
- def pearson_selection(self, max_features, X_train, y_train):
- correlations = []
- full_columns_and_corr = []
- abs_columns_and_corr = []
-
- # Compute the absolute correlations per column.
- for i in range(0, len(X_train.columns)):
- corr, p = pearsonr(X_train[X_train.columns[i]], y_train)
- correlations.append(abs(corr))
- if np.isfinite(corr):
- full_columns_and_corr.append((X_train.columns[i], corr))
- abs_columns_and_corr.append((X_train.columns[i], abs(corr)))
-
- sorted_attributes = sorted(abs_columns_and_corr,key=itemgetter(1), reverse=True)
- res_list = [x[0] for x in sorted_attributes[0:max_features]]
-
- # And return the most correlated ones.
- return res_list, sorted(full_columns_and_corr,key=itemgetter(1), reverse=True)
\ No newline at end of file
diff --git a/PythonCode/Chapter7/LearningAlgorithms.py b/PythonCode/Chapter7/LearningAlgorithms.py
deleted file mode 100644
index 30f26960..00000000
--- a/PythonCode/Chapter7/LearningAlgorithms.py
+++ /dev/null
@@ -1,454 +0,0 @@
-##############################################################
-# #
-# Mark Hoogendoorn and Burkhardt Funk (2017) #
-# Machine Learning for the Quantified Self #
-# Springer #
-# Chapter 7 #
-# #
-##############################################################
-
-from sklearn.neural_network import MLPClassifier
-from sklearn.neural_network import MLPRegressor
-from sklearn.svm import SVC
-from sklearn.svm import LinearSVC
-from sklearn.svm import SVR
-from sklearn.svm import LinearSVR
-from sklearn.neighbors import KNeighborsClassifier
-from sklearn.neighbors import KNeighborsRegressor
-from sklearn.tree import DecisionTreeClassifier
-from sklearn.tree import DecisionTreeRegressor
-from sklearn import tree
-from sklearn.naive_bayes import GaussianNB
-from sklearn.ensemble import RandomForestClassifier
-from sklearn.ensemble import RandomForestRegressor
-from sklearn.model_selection import GridSearchCV
-import pandas as pd
-import numpy as np
-
-class ClassificationAlgorithms:
-
- # Apply a neural network for classification upon the training data (with the specified composition of
- # hidden layers and number of iterations), and use the created network to predict the outcome for both the
- # test and training set. It returns the categorical predictions for the training and test set as well as the
- # probabilities associated with each class, each class being represented as a column in the data frame.
- def feedforward_neural_network(self, train_X, train_y, test_X, hidden_layer_sizes=(100,), max_iter=500, activation='logistic', alpha=0.0001, learning_rate='adaptive', gridsearch=True, print_model_details=False):
-
-
- if gridsearch:
- tuned_parameters = [{'hidden_layer_sizes': [(5,), (10,), (25,), (100,), (100,5,), (100,10,),], 'activation': [activation],
- 'learning_rate': [learning_rate], 'max_iter': [1000, 2000], 'alpha': [alpha]}]
- nn = GridSearchCV(MLPClassifier(), tuned_parameters, cv=5, scoring='accuracy')
- else:
- # Create the model
- nn = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes, activation=activation, max_iter=max_iter, learning_rate=learning_rate, alpha=alpha)
-
- # Fit the model
- nn.fit(train_X, train_y.values.ravel())
-
- if gridsearch and print_model_details:
- print nn.best_params_
-
- if gridsearch:
- nn = nn.best_estimator_
-
- # Apply the model
- pred_prob_training_y = nn.predict_proba(train_X)
- pred_prob_test_y = nn.predict_proba(test_X)
- pred_training_y = nn.predict(train_X)
- pred_test_y = nn.predict(test_X)
- frame_prob_training_y = pd.DataFrame(pred_prob_training_y, columns=nn.classes_)
- frame_prob_test_y = pd.DataFrame(pred_prob_test_y, columns=nn.classes_)
-
- return pred_training_y, pred_test_y, frame_prob_training_y, frame_prob_test_y
-
- # Apply a support vector machine for classification upon the training data (with the specified value for
- # C, epsilon and the kernel function), and use the created model to predict the outcome for both the
- # test and training set. It returns the categorical predictions for the training and test set as well as the
- # probabilities associated with each class, each class being represented as a column in the data frame.
- def support_vector_machine_with_kernel(self, train_X, train_y, test_X, kernel='rbf', C=1, gamma=1e-3, gridsearch=True, print_model_details=False):
- # Create the model
- if gridsearch:
- tuned_parameters = [{'kernel': ['rbf', 'poly'], 'gamma': [1e-3, 1e-4],
- 'C': [1, 10, 100]}]
- svm = GridSearchCV(SVC(probability=True), tuned_parameters, cv=5, scoring='accuracy')
- else:
- svm = SVC(C=C, kernel=kernel, gamma=gamma, probability=True, cache_size=7000)
-
- # Fit the model
- svm.fit(train_X, train_y.values.ravel())
-
- if gridsearch and print_model_details:
- print svm.best_params_
-
- if gridsearch:
- svm = svm.best_estimator_
-
- # Apply the model
- pred_prob_training_y = svm.predict_proba(train_X)
- pred_prob_test_y = svm.predict_proba(test_X)
- pred_training_y = svm.predict(train_X)
- pred_test_y = svm.predict(test_X)
- frame_prob_training_y = pd.DataFrame(pred_prob_training_y, columns=svm.classes_)
- frame_prob_test_y = pd.DataFrame(pred_prob_test_y, columns=svm.classes_)
-
- return pred_training_y, pred_test_y, frame_prob_training_y, frame_prob_test_y
-
- # Apply a support vector machine for classification upon the training data (with the specified value for
- # C, epsilon and the kernel function), and use the created model to predict the outcome for both the
- # test and training set. It returns the categorical predictions for the training and test set as well as the
- # probabilities associated with each class, each class being represented as a column in the data frame.
- def support_vector_machine_without_kernel(self, train_X, train_y, test_X, C=1, tol=1e-3, max_iter=1000, gridsearch=True, print_model_details=False):
- # Create the model
- if gridsearch:
- tuned_parameters = [{'max_iter': [1000, 2000], 'tol': [1e-3, 1e-4],
- 'C': [1, 10, 100]}]
- svm = GridSearchCV(LinearSVC(), tuned_parameters, cv=5, scoring='accuracy')
- else:
- svm = LinearSVC(C=C, tol=tol, max_iter=max_iter)
-
- # Fit the model
- svm.fit(train_X, train_y.values.ravel())
-
- if gridsearch and print_model_details:
- print svm.best_params_
-
- if gridsearch:
- svm = svm.best_estimator_
-
- # Apply the model
-
- distance_training_platt = 1/(1+np.exp(svm.decision_function(train_X)))
- pred_prob_training_y = distance_training_platt / distance_training_platt.sum(axis=1)[:,None]
- distance_test_platt = 1/(1+np.exp(svm.decision_function(test_X)))
- pred_prob_test_y = distance_test_platt / distance_test_platt.sum(axis=1)[:,None]
- pred_training_y = svm.predict(train_X)
- pred_test_y = svm.predict(test_X)
- frame_prob_training_y = pd.DataFrame(pred_prob_training_y, columns=svm.classes_)
- frame_prob_test_y = pd.DataFrame(pred_prob_test_y, columns=svm.classes_)
-
- return pred_training_y, pred_test_y, frame_prob_training_y, frame_prob_test_y
-
-
- # Apply a nearest neighbor approach for classification upon the training data (with the specified value for
- # k), and use the created model to predict the outcome for both the
- # test and training set. It returns the categorical predictions for the training and test set as well as the
- # probabilities associated with each class, each class being represented as a column in the data frame.
- def k_nearest_neighbor(self, train_X, train_y, test_X, n_neighbors=5, gridsearch=True, print_model_details=False):
- # Create the model
- if gridsearch:
- tuned_parameters = [{'n_neighbors': [1, 2, 5, 10]}]
- knn = GridSearchCV(KNeighborsClassifier(), tuned_parameters, cv=5, scoring='accuracy')
- else:
- knn = KNeighborsClassifier(n_neighbors=n_neighbors)
-
- # Fit the model
- knn.fit(train_X, train_y.values.ravel())
-
- if gridsearch and print_model_details:
- print knn.best_params_
-
- if gridsearch:
- knn = knn.best_estimator_
-
- # Apply the model
- pred_prob_training_y = knn.predict_proba(train_X)
- pred_prob_test_y = knn.predict_proba(test_X)
- pred_training_y = knn.predict(train_X)
- pred_test_y = knn.predict(test_X)
- frame_prob_training_y = pd.DataFrame(pred_prob_training_y, columns=knn.classes_)
- frame_prob_test_y = pd.DataFrame(pred_prob_test_y, columns=knn.classes_)
-
- return pred_training_y, pred_test_y, frame_prob_training_y, frame_prob_test_y
-
- # Apply a decision tree approach for classification upon the training data (with the specified value for
- # the minimum samples in the leaf, and the export path and files if print_model_details=True)
- # and use the created model to predict the outcome for both the
- # test and training set. It returns the categorical predictions for the training and test set as well as the
- # probabilities associated with each class, each class being represented as a column in the data frame.
- def decision_tree(self, train_X, train_y, test_X, min_samples_leaf=50, criterion='gini', print_model_details=False, export_tree_path='Example_graphs/Chapter7/', export_tree_name='tree.dot', gridsearch=True):
- # Create the model
- if gridsearch:
- tuned_parameters = [{'min_samples_leaf': [2, 10, 50, 100, 200],
- 'criterion':['gini', 'entropy']}]
- dtree = GridSearchCV(DecisionTreeClassifier(), tuned_parameters, cv=5, scoring='accuracy')
- else:
- dtree = DecisionTreeClassifier(min_samples_leaf=min_samples_leaf, criterion=criterion)
-
- # Fit the model
-
- dtree.fit(train_X, train_y.values.ravel())
-
- if gridsearch and print_model_details:
- print dtree.best_params_
-
- if gridsearch:
- dtree = dtree.best_estimator_
-
- # Apply the model
- pred_prob_training_y = dtree.predict_proba(train_X)
- pred_prob_test_y = dtree.predict_proba(test_X)
- pred_training_y = dtree.predict(train_X)
- pred_test_y = dtree.predict(test_X)
- frame_prob_training_y = pd.DataFrame(pred_prob_training_y, columns=dtree.classes_)
- frame_prob_test_y = pd.DataFrame(pred_prob_test_y, columns=dtree.classes_)
-
- if print_model_details:
- ordered_indices = [i[0] for i in sorted(enumerate(dtree.feature_importances_), key=lambda x:x[1], reverse=True)]
- print 'Feature importance decision tree:'
- for i in range(0, len(dtree.feature_importances_)):
- print train_X.columns[ordered_indices[i]],
- print ' & ',
- print dtree.feature_importances_[ordered_indices[i]]
- tree.export_graphviz(dtree, out_file=export_tree_path + export_tree_name, feature_names=train_X.columns, class_names=dtree.classes_)
-
- return pred_training_y, pred_test_y, frame_prob_training_y, frame_prob_test_y
-
- # Apply a naive bayes approach for classification upon the training data
- # and use the created model to predict the outcome for both the
- # test and training set. It returns the categorical predictions for the training and test set as well as the
- # probabilities associated with each class, each class being represented as a column in the data frame.
- def naive_bayes(self, train_X, train_y, test_X):
- # Create the model
- nb = GaussianNB()
-
- # Fit the model
- nb.fit(train_X, train_y)
-
- # Apply the model
- pred_prob_training_y = nb.predict_proba(train_X)
- pred_prob_test_y = nb.predict_proba(test_X)
- pred_training_y = nb.predict(train_X)
- pred_test_y = nb.predict(test_X)
- frame_prob_training_y = pd.DataFrame(pred_prob_training_y, columns=nb.classes_)
- frame_prob_test_y = pd.DataFrame(pred_prob_test_y, columns=nb.classes_)
-
- return pred_training_y, pred_test_y, frame_prob_training_y, frame_prob_test_y
-
- # Apply a random forest approach for classification upon the training data (with the specified value for
- # the minimum samples in the leaf, the number of trees, and if we should print some of the details of the
- # model print_model_details=True) and use the created model to predict the outcome for both the
- # test and training set. It returns the categorical predictions for the training and test set as well as the
- # probabilities associated with each class, each class being represented as a column in the data frame.
- def random_forest(self, train_X, train_y, test_X, n_estimators=10, min_samples_leaf=5, criterion='gini', print_model_details=False, gridsearch=True):
-
- if gridsearch:
- tuned_parameters = [{'min_samples_leaf': [2, 10, 50, 100, 200],
- 'n_estimators':[10, 50, 100],
- 'criterion':['gini', 'entropy']}]
- rf = GridSearchCV(RandomForestClassifier(), tuned_parameters, cv=5, scoring='accuracy')
- else:
- rf = RandomForestClassifier(n_estimators=n_estimators, min_samples_leaf=min_samples_leaf, criterion=criterion)
-
- # Fit the model
-
- rf.fit(train_X, train_y.values.ravel())
-
- if gridsearch and print_model_details:
- print rf.best_params_
-
- if gridsearch:
- rf = rf.best_estimator_
-
- pred_prob_training_y = rf.predict_proba(train_X)
- pred_prob_test_y = rf.predict_proba(test_X)
- pred_training_y = rf.predict(train_X)
- pred_test_y = rf.predict(test_X)
- frame_prob_training_y = pd.DataFrame(pred_prob_training_y, columns=rf.classes_)
- frame_prob_test_y = pd.DataFrame(pred_prob_test_y, columns=rf.classes_)
-
- if print_model_details:
- ordered_indices = [i[0] for i in sorted(enumerate(rf.feature_importances_), key=lambda x:x[1], reverse=True)]
- print 'Feature importance random forest:'
- for i in range(0, len(rf.feature_importances_)):
- print train_X.columns[ordered_indices[i]],
- print ' & ',
- print rf.feature_importances_[ordered_indices[i]]
-
- return pred_training_y, pred_test_y, frame_prob_training_y, frame_prob_test_y
-
-class RegressionAlgorithms:
-
- # Apply a neural network for regression upon the training data (with the specified composition of
- # hidden layers and number of iterations), and use the created network to predict the outcome for both the
- # test and training set. It returns the categorical numerical predictions for the training and test set.
- def feedforward_neural_network(self, train_X, train_y, test_X, hidden_layer_sizes=(100,), max_iter=500, activation='identity', learning_rate='adaptive', gridsearch=True, print_model_details=False):
- if gridsearch:
- tuned_parameters = [{'hidden_layer_sizes': [(5,), (10,), (25,), (100,), (100,5,), (100,10,),], 'activation': ['identity'],
- 'learning_rate': ['adaptive'], 'max_iter': [1000, 2000]}]
- nn = GridSearchCV(MLPRegressor(), tuned_parameters, cv=5, scoring='neg_mean_squared_error')
- else:
- # Create the model
- nn = MLPRegressor(hidden_layer_sizes=hidden_layer_sizes, activation=activation, max_iter=max_iter, learning_rate=learning_rate)
-
- # Fit the model
- nn.fit(train_X, train_y)
-
- if gridsearch and print_model_details:
- print rf.best_params_
-
- if gridsearch:
- nn = nn.best_estimator_
-
- # Apply the model
- pred_training_y = nn.predict(train_X)
- pred_test_y = nn.predict(test_X)
-
- return pred_training_y, pred_test_y
-
- # Apply a support vector machine with a given kernel function for regression upon the training data (with the specified value for
- # C, gamma and the kernel function), and use the created model to predict the outcome for both the
- # test and training set. It returns the predictions for the training and test set.
- def support_vector_regression_with_kernel(self, train_X, train_y, test_X, kernel='rbf', C=1, gamma=1e-3, gridsearch=True, print_model_details=False):
- if gridsearch:
- tuned_parameters = [{'kernel': ['rbf', 'poly'], 'gamma': [1e-3, 1e-4],
- 'C': [1, 10, 100]}]
- svr = GridSearchCV(SVR(), tuned_parameters, cv=5, scoring='neg_mean_squared_error')
- else:
- # Create the model
- svr = SVR(C=C, kernel=kernel, gamma=gamma)
-
- # Fit the model
- svr.fit(train_X, train_y)
-
- if gridsearch and print_model_details:
- print svr.best_params_
-
- if gridsearch:
- svr = svr.best_estimator_
-
- # Apply the model
- pred_training_y = svr.predict(train_X)
- pred_test_y = svr.predict(test_X)
-
- return pred_training_y, pred_test_y
-
- # Apply a support vector machine without a complex kernel function for regression upon the training data (with the specified value for
- # C, tolerance and max iterations), and use the created model to predict the outcome for both the
- # test and training set. It returns the predictions for the training and test set.
- def support_vector_regression_without_kernel(self, train_X, train_y, test_X, C=1, tol=1e-3, max_iter=1000, gridsearch=True, print_model_details=False):
- if gridsearch:
- tuned_parameters = [{'max_iter': [1000, 2000], 'tol': [1e-3, 1e-4],
- 'C': [1, 10, 100]}]
- svr = GridSearchCV(LinearSVR(), tuned_parameters, cv=5, scoring='neg_mean_squared_error')
- else:
- # Create the model
- svr = LinearSVR(C=C, kernel=kernel, tol=tol, max_iter=max_iter)
-
- # Fit the model
- svr.fit(train_X, train_y)
-
- if gridsearch and print_model_details:
- print svr.best_params_
-
- if gridsearch:
- svr = svr.best_estimator_
-
- # Apply the model
- pred_training_y = svr.predict(train_X)
- pred_test_y = svr.predict(test_X)
-
- return pred_training_y, pred_test_y
-
- # Apply a nearest neighbor approach for regression upon the training data (with the specified value for
- # k), and use the created model to predict the outcome for both the
- # test and training set. It returns the predictions for the training and test set.
- def k_nearest_neighbor(self, train_X, train_y, test_X, n_neighbors=5, gridsearch=True, print_model_details=False):
- # Create the model
- if gridsearch:
- tuned_parameters = [{'n_neighbors': [1, 2, 5, 10]}]
- knn = GridSearchCV(KNeighborsRegressor(), tuned_parameters, cv=5, scoring='neg_mean_squared_error')
- else:
- # Create the model
- knn = KNeighborsRegressor(n_neighbors=n_neighbors)
-
- # Fit the model
- knn.fit(train_X, train_y)
-
- if gridsearch and print_model_details:
- print knn.best_params_
-
- if gridsearch:
- knn = knn.best_estimator_
-
- # Apply the model
- pred_training_y = knn.predict(train_X)
- pred_test_y = knn.predict(test_X)
-
- return pred_training_y, pred_test_y
-
- # Apply a decision tree approach for regression upon the training data (with the specified value for
- # the minimum samples in the leaf, and the export path and files if print_model_details=True)
- # and use the created model to predict the outcome for both the
- # test and training set. It returns the predictions for the training and test set.
- def decision_tree(self, train_X, train_y, test_X, min_samples_leaf=50, criterion='mse', print_model_details=False, export_tree_path='Example_graphs/Chapter7/', export_tree_name='tree.dot', gridsearch=True):
- # Create the model
- if gridsearch:
- tuned_parameters = [{'min_samples_leaf': [2, 10, 50, 100, 200],
- 'criterion':['mse']}]
- dtree = GridSearchCV(DecisionTreeRegressor(), tuned_parameters, cv=5, scoring='neg_mean_squared_error')
- else:
- # Create the model
- dtree = DecisionTreeRegressor(min_samples_leaf=min_samples_leaf, criterion=criterion)
-
- # Fit the model
- dtree.fit(train_X, train_y)
-
- if gridsearch and print_model_details:
- print dtree.best_params_
-
- if gridsearch:
- dtree = dtree.best_estimator_
-
- # Apply the model
- pred_training_y = dtree.predict(train_X)
- pred_test_y = dtree.predict(test_X)
-
- if print_model_details:
- print 'Feature importance decision tree:'
- ordered_indices = [i[0] for i in sorted(enumerate(dtree.feature_importances_), key=lambda x:x[1], reverse=True)]
- for i in range(0, len(dtree.feature_importances_)):
- print train_X.columns[ordered_indices[i]],
- print ' & ',
- print dtree.feature_importances_[ordered_indices[i]]
- tree.export_graphviz(dtree, out_file=export_tree_path + export_tree_name, feature_names=train_X.columns, class_names=dtree.classes_)
-
- return pred_training_y, pred_test_y
-
- # Apply a random forest approach for regression upon the training data (with the specified value for
- # the minimum samples in the leaf, the number of trees, and if we should print some of the details of the
- # model print_model_details=True) and use the created model to predict the outcome for both the
- # test and training set. It returns the predictions for the training and test set.
- def random_forest(self, train_X, train_y, test_X, n_estimators=10, min_samples_leaf=5, criterion='mse', print_model_details=False, gridsearch=True):
-
- if gridsearch:
- tuned_parameters = [{'min_samples_leaf': [2, 10, 50, 100, 200],
- 'n_estimators':[10, 50, 100],
- 'criterion':['mse']}]
- rf = GridSearchCV(RandomForestRegressor(), tuned_parameters, cv=5, scoring='neg_mean_squared_error')
- else:
- # Create the model
- rf = RandomForestRegressor(n_estimators=n_estimators, min_samples_leaf=min_samples_leaf, criterion=criterion)
-
- # Fit the model
- rf.fit(train_X, train_y)
-
- if gridsearch and print_model_details:
- print rf.best_params_
-
- if gridsearch:
- rf = rf.best_estimator_
-
- # Apply the model
- pred_training_y = rf.predict(train_X)
- pred_test_y = rf.predict(test_X)
-
- if print_model_details:
- print 'Feature importance random forest:'
- ordered_indices = [i[0] for i in sorted(enumerate(rf.feature_importances_), key=lambda x:x[1], reverse=True)]
-
- for i in range(0, len(rf.feature_importances_)):
- print train_X.columns[ordered_indices[i]],
- print ' & ',
- print rf.feature_importances_[ordered_indices[i]]
-
- return pred_training_y, pred_test_y
diff --git a/PythonCode/Chapter7/PrepareDatasetForLearning.py b/PythonCode/Chapter7/PrepareDatasetForLearning.py
deleted file mode 100644
index 8d42bfed..00000000
--- a/PythonCode/Chapter7/PrepareDatasetForLearning.py
+++ /dev/null
@@ -1,194 +0,0 @@
-##############################################################
-# #
-# Mark Hoogendoorn and Burkhardt Funk (2017) #
-# Machine Learning for the Quantified Self #
-# Springer #
-# Chapter 7 #
-# #
-##############################################################
-
-from sklearn.model_selection import train_test_split
-import numpy as np
-import random
-import copy
-import pandas as pd
-
-# This class creates datasets that can be used by the learning algorithms. Up till now we have
-# assumed binary columns for each class, we will for instance introduce approaches to create
-# a single categorical attribute.
-class PrepareDatasetForLearning:
-
- default_label = 'undefined'
- class_col = 'class'
- person_col = 'person'
-
- # This function creates a single class column based on a set of binary class columns.
- # it essentially merges them. It removes the old label columns.
- def assign_label(self, dataset, class_labels):
- # Find which columns are relevant based on the possibly partial class_label
- # specification.
- labels = []
- for i in range(0, len(class_labels)):
- labels.extend([name for name in list(dataset.columns) if class_labels[i] == name[0:len(class_labels[i])]])
-
- # Determine how many class values are label as 'true' in our class columns.
- sum_values = dataset[labels].sum(axis=1)
- # Create a new 'class' column and set the value to the default class.
- dataset['class'] = self.default_label
- for i in range(0, len(dataset.index)):
- # If we have exactly one true class column, we can assign that value,
- # otherwise we keep the default class.
- if sum_values.ix[i,:] == 1:
- dataset.ix[i, self.class_col] = dataset.ix[i, labels].idxmax(axis=1)
- # And remove our old binary columns.
- dataset = dataset.drop(labels, axis=1)
- return dataset
-
- # Split a dataset of a single person for a classificaiton problem with the the specified class columns class_labels.
- # We can have multiple targets if we want. It assumes a list in 'class_labels'
- # If 'like' is specified in matching, we will merge the columns that contain the class_labels into a single
- # columns. We can select a filter for rows where we are unable to identifty a unique
- # class and we can select whether we have a temporal dataset or not. In the former, we will select the first
- # training_frac of the data for training and the last 1-training_frac for testing. Otherwise, we select points randomly.
- # We return a training set, the labels of the training set, and the same for a test set. We can set the random seed
- # to make the split reproducible.
- def split_single_dataset_classification(self, dataset, class_labels, matching, training_frac, filter=True, temporal=False, random_state=0):
- # Create a single class column if we have the 'like' option.
- if matching == 'like':
- dataset = self.assign_label(dataset, class_labels)
- class_labels = self.class_col
- elif len(class_labels) == 1:
- class_labels = class_labels[0]
-
- # Filer NaN is desired and those for which we cannot determine the class should be removed.
- if filter:
- dataset = dataset.dropna()
- dataset = dataset[dataset['class'] != self.default_label]
-
- # The features are the ones not in the class label.
- features = [x for x in dataset.columns if x not in class_labels]
-
- # For temporal data, we select the desired fraction of training data from the first part
- # and use the rest as test set.
- if temporal:
- end_training_set = int(training_frac * len(dataset.index))
- training_set_X = dataset.ix[0:end_training_set, features]
- training_set_y = dataset.ix[0:end_training_set, class_labels]
- test_set_X = dataset.ix[end_training_set:len(dataset.index), features]
- test_set_y = dataset.ix[end_training_set:len(dataset.index), class_labels]
- print test_set_y
- # For non temporal data we use a standard function to randomly split the dataset.
- else:
- training_set_X, test_set_X, training_set_y, test_set_y = train_test_split(dataset.ix[:,features],
- dataset.ix[:,class_labels], test_size=(1-training_frac), stratify=dataset.ix[:,class_labels], random_state=random_state)
- return training_set_X, test_set_X, training_set_y, test_set_y
-
- def split_single_dataset_regression_by_time(self, dataset, target, start_training, end_training, end_test):
- training_instances = dataset[start_training:end_training]
- test_instances = dataset[end_training:end_test]
- train_y = copy.deepcopy(training_instances[target])
- test_y = copy.deepcopy(test_instances[target])
- train_X = training_instances
- del train_X[target]
- test_X = test_instances
- del test_X[target]
- return train_X, test_X, train_y, test_y
-
-
- # Split a dataset of a single person for a regression with the specified targets. We can
- # have multiple targets if we want. It assumes a list in 'targets'
- # We can select whether we have a temporal dataset or not. In the former, we will select the first
- # training_frac of the data for training and the last 1-training_frac for testing. Otherwise, we select points randomly.
- # We return a training set, the labels of the training set, and the same for a test set. We can set the random seed
- # to make the split reproducible.
- def split_single_dataset_regression(self, dataset, targets, training_frac, filter=False, temporal=False, random_state=0):
- # We just temporarily change some attribute values associated with the classification algorithm
- # and change them for numerical values. We then simply apply the classification variant of the
- # function.
- temp_default_label = self.default_label
- self.default_label = np.nan
- training_set_X, test_set_X, training_set_y, test_set_y = self.split_single_dataset_classification(dataset, targets, 'exact', training_frac, filter=filter, temporal=temporal, random_state=random_state)
- self.default_label = temp_default_label
- return training_set_X, test_set_X, training_set_y, test_set_y
-
- # If we have multiple overlapping indices (e.g. user 1 and user 2 have the same time stamps) our
- # series cannot me merged properly, therefore we can create a new index.
- def update_set(self, source_set, addition):
- if source_set is None:
- return addition
- else:
- # Check if the index is unique. If not, create a new index.
- if len(set(source_set.index) & set(addition.index)) > 0:
- return source_set.append(addition).reset_index(drop=True)
- else:
- return source_set.append(addition)
-
- # If we have multiple datasets representing different users and want to perform classification,
- # we do the same as we have seen for the single dataset
- # case. However, now we can in addition select what we would like to predict: do we want to perform well for an unknown
- # use (unknown_user=True) or for unseen data over all users. In the former, it return a training set containing
- # all data of training_frac users and test data for the remaining users. If the later, it return the training_frac
- # data of each user as a training set, and 1-training_frac data as a test set.
- def split_multiple_datasets_classification(self, datasets, class_labels, matching, training_frac, filter=False, temporal=False, unknown_users=False, random_state=0):
- training_set_X = None
- training_set_y = None
- test_set_X = None
- test_set_y = None
-
- # If we want to learn to predict well for unknown users.
- if unknown_users:
- # Shuffle the users we have.
- random.seed(random_state)
- indices = range(0, len(datasets))
- random.shuffle(indices)
- training_len = int(training_frac * len(datasets))
-
- # And select the data of the first fraction training_frac of users as the training set and the data of
- # the remaining users as test set.
- for i in range(0, training_len):
- # We use the single dataset function for classification and add it to the training data
- training_set_X_person, test_set_X_person, training_set_y_person, test_set_y_person = self.split_single_dataset_classification(datasets[indices[i]], class_labels, matching,
- 1, filter=filter, temporal=temporal, random_state=random_state)
- # We add a person column.
- training_set_X_person[self.person_col] = indices[i]
- training_set_X = self.update_set(training_set_X, training_set_X_person)
- training_set_y = self.update_set(training_set_y, training_set_y_person)
-
- for j in range(training_len, len(datasets)):
- # We use the single dataset function for classification and add it to the test data
- training_set_X_person, test_set_X_person, training_set_y_person, test_set_y_person = self.split_single_dataset_classification(datasets[indices[j]], class_labels, matching,
- 1, filter=filter, temporal=temporal, random_state=random_state)
- # We add a person column.
- training_set_X_person[self.person_col] = indices[j]
- test_set_X = self.update_set(test_set_X, training_set_X_person)
- test_set_y = self.update_set(test_set_y, training_set_y_person)
- else:
- init = True
- # Otherwise we split each dataset individually in a training and test set and add them.
- for i in range(0, len(datasets)):
- training_set_X_person, test_set_X_person, training_set_y_person, test_set_y_person = self.split_single_dataset_classification(datasets[i], class_labels, matching,
- training_frac, filter=filter, temporal=temporal, random_state=random_state)
- # We add a person column.
- training_set_X_person[self.person_col] = i
- test_set_X_person[self.person_col] = i
- training_set_X = self.update_set(training_set_X, training_set_X_person)
- training_set_y = self.update_set(training_set_y, training_set_y_person)
- test_set_X = self.update_set(test_set_X, test_set_X_person)
- test_set_y = self.update_set(test_set_y, test_set_y_person)
- return training_set_X, test_set_X, training_set_y, test_set_y
-
- # If we have multiple datasets representing different users and want to perform regression,
- # we do the same as we have seen for the single dataset
- # case. However, now we can in addition select what we would like to predict: do we want to perform well for an unknown
- # use (unknown_user=True) or for unseen data over all users. In the former, it return a training set containing
- # all data of training_frac users and test data for the remaining users. If the later, it return the training_frac
- # data of each user as a training set, and 1-training_frac data as a test set.
- def split_multiple_datasets_regression(self, datasets, targets, training_frac, filter=False, temporal=False, unknown_users=False, random_state=0):
- # We just temporarily change some attribute values associated with the regression algorithm
- # and change them for numerical values. We then simply apply the classification variant of the
- # function.
- temp_default_label = self.default_label
- self.default_label = np.nan
- training_set_X, test_set_X, training_set_y, test_set_y = self.split_multiple_datasets_classification(datasets, targets, 'exact', training_frac, filter=filter, temporal=temporal, unknown_users=unknown_users, random_state=random_state)
- self.default_label = temp_default_label
- return training_set_X, test_set_X, training_set_y, test_set_y
diff --git a/PythonCode/Chapter7/__init__.py b/PythonCode/Chapter7/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/PythonCode/Chapter8/LearningAlgorithmsTemporal.py b/PythonCode/Chapter8/LearningAlgorithmsTemporal.py
deleted file mode 100644
index aa7a673d..00000000
--- a/PythonCode/Chapter8/LearningAlgorithmsTemporal.py
+++ /dev/null
@@ -1,613 +0,0 @@
-##############################################################
-# #
-# Mark Hoogendoorn and Burkhardt Funk (2017) #
-# Machine Learning for the Quantified Self #
-# Springer #
-# Chapter 8 #
-# #
-##############################################################
-
-import pandas as pd
-import scipy.linalg
-import copy
-import random
-import numpy as np
-from scipy import linalg
-import inspyred
-from Chapter8.dynsys.Model import Model
-from Chapter8.dynsys.Evaluator import Evaluator
-from pybrain.structure import RecurrentNetwork
-from pybrain.structure import LinearLayer, SigmoidLayer, FullConnection
-from pybrain.datasets import SequentialDataSet
-from pybrain.supervised.trainers import BackpropTrainer
-from pybrain.supervised.trainers import RPropMinusTrainer
-from pybrain.tools.validation import testOnSequenceData
-from pybrain.tools.shortcuts import buildNetwork
-from Chapter7.Evaluation import ClassificationEvaluation
-from Chapter7.Evaluation import RegressionEvaluation
-import sys
-import matplotlib.pyplot as plot
-import pyflux as pf
-from statsmodels.tsa.arima_model import ARIMA
-
-
-
-# The class includes several algorithms that capture the temporal dimension explicitly for
-# classification problems.
-class TemporalClassificationAlgorithms:
-
- # This function converts a single dataset (no test or train split up) with possibly
- # categorical attributes to a numerical dataset, where categorical attributes are
- # taken as dummy variables (i.e. binary columns for each possible value).
- def create_numerical_single_dataset(self, dataset):
- return copy.deepcopy(pd.get_dummies(pd.DataFrame(dataset), prefix='', prefix_sep=''))
-
- # This function converts a train and test dataset with possibly
- # categorical attributes to a numerical dataset, where categorical attributes are
- # taken as dummy variables (i.e. binary columns for each possible value).
- def create_numerical_multiple_dataset(self, train, test):
-
- # Combine the two datasets as we want to include all possible values
- # for the categorical attribute.
- total_dataset = train.append(test)
-
- # Convert and split up again.
- total_dataset = pd.get_dummies(pd.DataFrame(total_dataset), prefix='', prefix_sep='')
- new_train = copy.deepcopy(total_dataset.iloc[0:len(train.index),:])
- new_test = copy.deepcopy(total_dataset.iloc[len(train.index):len(train.index)+len(test.index),:])
- return new_train, new_test
-
- # This function initializes an echo state network given the specified number of
- # inputs, outputs, and nodes in the reservoir. It returns the weight matrices W_in,
- # W, and W_back.
- def initialize_echo_state_network(self, inputs, outputs, reservoir):
-
- # http://minds.jacobs-university.de/mantas/code
- # Create random matrices.
- Win = (np.random.rand(reservoir,1+inputs)-0.5) * 1
- W = np.random.rand(reservoir,reservoir)-0.5
- Wback = (np.random.rand(reservoir,outputs)-0.5) * 1
-
- # Adjust W to "guarantee" the echo state property.
- rhoW = max(abs(linalg.eig(W)[0]))
- W *= 1.25 / rhoW
- return Win, W, Wback
-
- # Predict the values of an echo state network given the matrices Win, W, Wback, Wout, the setting for a,
- # the reservoir size, and the dataset (which potentially includes the target as well). The cols are
- # the relevant columns of X. Finally, per_time_step=True means that we feed to correct output back into
- # the network instead of our prediction (this requires a non empty y_true). It returns the predicted class
- # and probabilites per class in the form a pandas dataframe with a column per class value.
- # http://minds.jacobs-university.de/sites/default/files/uploads/mantas/code/minimalESN.py.txt
-
- def predict_values_echo_state_network(self, Win, W, Wback, Wout, a, reservoir_size, X, y_true, cols, per_time_step):
- # http://minds.jacobs-university.de/mantas/code
- # Set the initial activation to zero.
- x = np.zeros((reservoir_size,1))
- Y = []
-
- # Predict all time points.
- for t in range(0, len(X.index)):
-
- # Set the input according to X.
- u = X.ix[t,:].as_matrix()
-
- # If we have a previous time point
- if t > 0:
-
- # If we predict per time step, set the previous value
- # to the true previous value.
- if per_time_step:
- y_prev = y_true.ix[t-1,:].as_matrix()
-
- # Otherwise set it to the predicted value.
- else:
- y_prev= y
-
- # If we do not have a previous time point, set the values to 0.
- else:
- y_prev = np.array([0]*len(cols))
-
- # Compute the activation of the reservoir.
- x = (1-a)*x + a*np.tanh( np.dot( Win, np.vstack(np.insert(u,0,1)) ) + np.dot( W, x ) + np.dot( Wback, np.vstack(y_prev) ))
-
- # And the output.
- y = np.tanh( np.dot( Wout, np.hstack(np.insert(np.insert(x, 0, u), 0, 1)) ))
- Y.append(y)
- y_result = pd.DataFrame(Y, columns=cols, index=X.index)
- return y_result.idxmax(axis=1), y_result
-
- # Given a dictionary with an ordered list of parameter values to try, return
- # all possible combinations in the form of a list.
- def generate_parameter_combinations(self, parameter_dict, params):
- combinations = []
- if len(params) == 1:
- values = parameter_dict[params[0]]
- for val in values:
- combinations.append([val])
- return combinations
- else:
- params_without_first_element = copy.deepcopy(params)
- params_without_first_element.pop(0)
- params_without_first_element_combinations = self.generate_parameter_combinations(parameter_dict, params_without_first_element)
- values_first_element = parameter_dict[params[0]]
- for i in range(0, len(values_first_element)):
- for j in range(0, len(params_without_first_element_combinations)):
- list = [values_first_element[i]]
- list.extend(params_without_first_element_combinations[j])
- combinations.append(list)
- return combinations
-
- def gridsearch_reservoir_computing(self, train_X, train_y, test_X, test_y, per_time_step=False, error = 'mse', gridsearch_training_frac=0.7):
- tuned_parameters = {'a': [0.6, 0.8], 'reservoir_size':[400, 700, 1000]}
-# tuned_parameters = {'a': [0.4], 'reservoir_size':[250]}
- params = tuned_parameters.keys()
- combinations = self.generate_parameter_combinations(tuned_parameters, params)
- split_point = int(gridsearch_training_frac * len(train_X.index))
- train_params_X = train_X.ix[0:split_point,]
- test_params_X = train_X.ix[split_point:len(train_X.index),]
- train_params_y = train_y.ix[0:split_point,]
- test_params_y = train_y.ix[split_point:len(train_X.index),]
-
- if error == 'mse':
- best_error = sys.float_info.max
- elif error == 'accuracy':
- best_error = 0
-
- best_combination = []
- for comb in combinations:
- print comb
- # Order of the keys might have changed.
- keys = tuned_parameters.keys()
- pred_train_y, pred_test_y, pred_train_y_prob, pred_test_y_prob = self.reservoir_computing(train_params_X, train_params_y, test_params_X, test_params_y,
- reservoir_size=comb[keys.index('reservoir_size')], a=comb[keys.index('a')], per_time_step=per_time_step,
- gridsearch=False)
-
- if error == 'mse':
- eval = RegressionEvaluation()
- mse = eval.mean_squared_error(test_params_y, pred_test_y_prob)
- if mse < best_error:
- best_error = mse
- best_combination = comb
- elif error == 'accuracy':
- eval = ClassificationEvaluation()
- acc = eval.accuracy(test_params_y, pred_test_y)
- if acc > best_error:
- best_error = acc
- best_combination = comb
-
- print '-------'
- print best_combination
- print '-------'
- return best_combination[keys.index('reservoir_size')], best_combination[keys.index('a')]
-
- def normalize(self, train, test, range_min, range_max):
-
- total = copy.deepcopy(train).append(test, ignore_index=True)
-
- max = total.max()
- min = total.min()
- difference = max - min
- difference = difference.replace(0, 1)
-
- new_train = (((train - min)/difference) * (range_max - range_min)) + range_min
- new_test = (((test - min)/difference) * (range_max - range_min)) + range_min
- return new_train, new_test, min, max
-
-
- def denormalize(self, y, min, max, range_min, range_max):
- difference = max - min
- difference = difference.replace(0, 1)
-
- y = (y - range_min)/(range_max - range_min)
-
- return (y * difference) + min
-
- # Apply an echo state network for classification upon the training data (with the specified reservoir size),
- # and use the created network to predict the outcome for both the
- # test and training set. It returns the categorical predictions for the training and test set as well as the
- # probabilities associated with each class, each class being represented as a column in the data frame.
- def reservoir_computing(self, train_X, train_y, test_X, test_y, reservoir_size=100, a=0.8, per_time_step=False, gridsearch=True, gridsearch_training_frac=0.7, error='accuracy'):
- # Inspired by http://minds.jacobs-university.de/mantas/code
-
- if gridsearch:
- reservoir_size, a = self.gridsearch_reservoir_computing(train_X, train_y, test_X, test_y, per_time_step=per_time_step, gridsearch_training_frac=gridsearch_training_frac, error=error)
-
- # We assume these parameters as fixed, but feel free to change them as well.
- washout_period = 10
-
- # Create a numerical dataset without categorical attributes.
- new_train_X, new_test_X = self.create_numerical_multiple_dataset(train_X, test_X)
- if test_y is None:
- new_train_y = self.create_numerical_single_dataset(train_y)
- new_test_y = None
- else:
- new_train_y, new_test_y = self.create_numerical_multiple_dataset(train_y, test_y)
-
- # We normalize the input.....
- new_train_X, new_test_X, min_X, max_X = self.normalize(new_train_X, new_test_X, 0, 1)
- new_train_y, new_test_y, min_y, max_y = self.normalize(new_train_y, new_test_y, -0.9, 0.9)
-
- inputs = len(new_train_X.columns)
- outputs = len(new_train_y.columns)
-
- # Randomly initialize our weight vectors.
- Win, W, Wback = self.initialize_echo_state_network(inputs, outputs, reservoir_size)
-
-
- # Allocate memory for our result matrices.
- X = np.zeros((len(train_X.index)-washout_period, 1+inputs+reservoir_size))
- Yt = new_train_y.ix[washout_period:len(new_train_y.index),:].as_matrix()
- Yt = np.arctanh( Yt )
- x = np.zeros((reservoir_size,1))
-
- # Train over all time points.
- for t in range(0, len(new_train_X.index)):
-
- # Set the inputs according to the values seen in the training set.
- u = new_train_X.ix[t,:].as_matrix()
-
- # Set the previous target value to the real value if available.
- if t > 0:
- y_prev= new_train_y.ix[t-1,:].as_matrix()
- else:
- y_prev = np.array([0]*outputs)
-
- # Determine the activation of the reservoir.
- x = (1-a)*x + a*np.tanh( np.dot( Win, np.vstack(np.insert(u,0,1)) ) + np.dot( W, x ) + np.dot( Wback, np.vstack(y_prev) ))
-
- # And store the values obtained after the washout period.
- if t >= washout_period:
- X[t-washout_period,:] = np.hstack(np.insert(np.insert(x, 0, u), 0, 1))
-
-
- # Train Wout.
- X_p = linalg.pinv(X)
- Wout = np.transpose(np.dot( X_p, Yt ))
-
- # And predict for both training and test set.
-
-
- pred_train_y, pred_train_y_prob = self.predict_values_echo_state_network(Win, W, Wback, Wout, a, reservoir_size, new_train_X, new_train_y, new_train_y.columns, per_time_step)
- pred_test_y, pred_test_y_prob = self.predict_values_echo_state_network(Win, W, Wback, Wout, a, reservoir_size, new_test_X, new_test_y, new_train_y.columns, per_time_step)
-
- pred_train_y_prob = self.denormalize(pred_train_y_prob, min_y, max_y, -0.9, 0.9)
- pred_test_y_prob = self.denormalize(pred_test_y_prob, min_y, max_y, -0.9, 0.9)
-
- return pred_train_y, pred_test_y, pred_train_y_prob, pred_test_y_prob
-
- # Creates a recurrent neural network dataset according to the pybrain specification.
- # Returns this new format.
- def rnn_dataset(self, X, y):
-
- # Create an empty dataset.
- ds = SequentialDataSet(len(X.columns), len(y.columns))
-
- # And add all rows...
- for i in range(0, len(X.index)):
- ds.addSample(tuple(X.ix[i,:].values), tuple(y.ix[i,:].values))
- return ds
-
- # Do a gridsearch for the recurrent neural network...
- def gridsearch_recurrent_neural_network(self, train_X, train_y, test_X, test_y, error='accuracy', gridsearch_training_frac=0.7):
- tuned_parameters = {'n_hidden_neurons': [50, 100], 'iterations':[250, 500], 'outputbias': [True]}
- params = tuned_parameters.keys()
- combinations = self.generate_parameter_combinations(tuned_parameters, params)
- split_point = int(gridsearch_training_frac * len(train_X.index))
- train_params_X = train_X.ix[0:split_point,]
- test_params_X = train_X.ix[split_point:len(train_X.index),]
- train_params_y = train_y.ix[0:split_point,]
- test_params_y = train_y.ix[split_point:len(train_X.index),]
-
- if error == 'mse':
- best_error = sys.float_info.max
- elif error == 'accuracy':
- best_error = 0
-
- best_combination = []
- for comb in combinations:
- print comb
- # Order of the keys might have changed.
- keys = tuned_parameters.keys()
- pred_train_y, pred_test_y, pred_train_y_prob, pred_test_y_prob = self.recurrent_neural_network(train_params_X, train_params_y, test_params_X, test_params_y,
- n_hidden_neurons=comb[params.index('n_hidden_neurons')], iterations=comb[params.index('iterations')],
- outputbias=comb[params.index('outputbias')], gridsearch=False)
-
- if error == 'mse':
- eval = RegressionEvaluation()
- mse = eval.mean_squared_error(test_params_y, pred_test_y_prob)
- if mse < best_error:
- best_error = mse
- best_combination = comb
- elif error == 'accuracy':
- eval = ClassificationEvaluation()
- acc = eval.accuracy(test_params_y, pred_test_y)
- if acc > best_error:
- best_error = acc
- best_combination = comb
- print '-------'
- print best_combination
- print '-------'
- return best_combination[params.index('n_hidden_neurons')], best_combination[params.index('iterations')], best_combination[params.index('outputbias')]
-
- # Apply a recurrent neural network for classification upon the training data (with the specified number of
- # hidden neurons and iterations), and use the created network to predict the outcome for both the
- # test and training set. It returns the categorical predictions for the training and test set as well as the
- # probabilities associated with each class, each class being represented as a column in the data frame.
- def recurrent_neural_network(self, train_X, train_y, test_X, test_y, n_hidden_neurons=50, iterations=100, gridsearch=False, gridsearch_training_frac=0.7, outputbias=False, error='accuracy'):
-
- if gridsearch:
- n_hidden_neurons, iterations, outputbias = self.gridsearch_recurrent_neural_network(train_X, train_y, test_X, test_y, gridsearch_training_frac=gridsearch_training_frac, error=error)
- # Create numerical datasets first.
- new_train_X, new_test_X = self.create_numerical_multiple_dataset(train_X, test_X)
- new_train_y, new_test_y = self.create_numerical_multiple_dataset(train_y, test_y)
-
- # We normalize the input.....
- new_train_X, new_test_X, min_X, max_X = self.normalize(new_train_X, new_test_X, 0, 1)
- new_train_y, new_test_y, min_y, max_y = self.normalize(new_train_y, new_test_y, 0.1, 0.9)
-
- # Create the proper pybrain datasets.
- ds_training = self.rnn_dataset(new_train_X, new_train_y)
- ds_test = self.rnn_dataset(new_test_X, new_test_y)
-
- inputs = len(new_train_X.columns)
- outputs = len(new_train_y.columns)
-
- # Build the network with the proper parameters.
- n = buildNetwork(inputs, n_hidden_neurons, outputs, hiddenclass=SigmoidLayer, outclass=SigmoidLayer, outputbias=outputbias, recurrent=True)
-
- # Train using back propagation through time.
- #trainer = BackpropTrainer(n, dataset=ds_training, verbose=False, momentum=0.9, learningrate=0.01)
- trainer = RPropMinusTrainer(n, dataset=ds_training, verbose=False)
-
- for i in range(0, iterations):
- trainer.train()
-
-# for mod in n.modules:
-# for conn in n.connections[mod]:
-# print conn
-# for cc in range(len(conn.params)):
-# print conn.whichBuffers(cc), conn.params[cc]
-
- # Determine performance on the training and test set....
-# Y_train = []
-# for i in range(0, len(new_train_X.index)):
-# input = tuple(new_train_X.ix[i,:].values)
-# output = n.activate(input)
-# Y_train.append(output)
-# Y_test = []
-# for i in range(0, len(new_test_X.index)):
-# Y_test.append(n.activate(tuple(new_test_X.ix[i,:].values)))
-
- Y_train = []
- Y_test = []
-
- for sample, target in ds_training.getSequenceIterator(0):
- Y_train.append(n.activate(sample).tolist())
-
- for sample, target in ds_test.getSequenceIterator(0):
- Y_test.append(n.activate(sample).tolist())
-
- y_train_result = pd.DataFrame(Y_train, columns=new_train_y.columns, index=train_y.index)
- y_test_result = pd.DataFrame(Y_test, columns=new_test_y.columns, index=test_y.index)
-
-# print y_train_result
-
- y_train_result = self.denormalize(y_train_result, min_y, max_y, 0.1, 0.9)
- y_test_result = self.denormalize(y_test_result, min_y, max_y, 0.1, 0.9)
-
-# plot.plot(train_y.index, train_y)
-# plot.hold(True)
-# plot.plot(train_y.index, pred_train_y_prob)
-# plot.show()
-
-
- return y_train_result.idxmax(axis=1), y_test_result.idxmax(axis=1), y_train_result, y_test_result
-
-# The class includes several algorithm that capture the temporal dimension explicitly for
-# classification problems.
-class TemporalRegressionAlgorithms:
-
- # Applies a known dynamical systems model for a regression problem by tuning its parameters towards the data.
- # Hereto, it can use multiple objectives as it uses the nsga 2 algorithm. To be provided are:
- # training set (both input and target)
- # test set (both input and target)
- # a list of columns the model addresses (i.e. the states), the string should be preceded by 'self.' in order for the approach to work.
- # a list of equations to derive the specified states, again using 'self.' preceding all parameters and columns names.
- # a list of targets (a subset of the columns) (again with 'self.')
- # a list of parameters in the equations (again with 'self.')
- # the population size of nsga 2
- # the maximum number of generations for nsga 2
- # whether we want to predict per time point (i.e. we reset the state values of the previous time point to their
- # observed values.
- # It returns a series of predictions for the training and test sets that are the results of parameter setting that are positioned
- # on the Pareto front.
- def dynamical_systems_model_nsga_2(self, train_X, train_y, test_X, test_y, columns, equations, targets, parameters, pop_size=10, max_generations=100, per_time_step=True):
- prng = random.Random()
- evaluator = Evaluator()
- model = Model()
-
- # Create the model.
- model.set_model(columns, equations, parameters)
-
- # Set the desired/known values in our evaluator.
- evaluator.set_values(model, train_X, train_y, test_X, test_y, targets)
-
- # Initialize the NSGA2 algorithm.
- ea = inspyred.ec.emo.NSGA2(prng)
- ea.variator = [inspyred.ec.variators.blend_crossover,
- inspyred.ec.variators.gaussian_mutation]
- ea.terminator = inspyred.ec.terminators.generation_termination
-
- # Let it run.
- final_pop = ea.evolve(generator=evaluator.generator, evaluator=evaluator.evaluator_multi_objective, pop_size=pop_size, maximize=False, bounder=None,max_generations=max_generations)
- final_arc = ea.archive
-
- # For all solutions (they reside on the pareto front)
- return_values = []
- for f in final_arc:
-
- # Predict the results.
- train_fitness, y_train_pred = evaluator.predict(f.candidate, training=True, per_time_step=per_time_step)
- test_fitness, y_test_pred = evaluator.predict(f.candidate, training=False, per_time_step=per_time_step)
-
- # And collect the predictions and fitness values.
- row = [y_train_pred, train_fitness, y_test_pred, test_fitness]
- return_values.append(row)
- return return_values
-
- # Applies a known dynamical systems model for a regression problem by tuning its parameters towards the data using a GA.
- # Hereto, it can use multiple objectives but will just average the values of each one. In the end, one solution
- # will be provided. To be provided are:
- # training set (both input and target)
- # test set (both input and target)
- # a list of columns the model addresses (i.e. the states), the string should be preceded by 'self.' in order for the approach to work.
- # a list of equations to derive the specified states, again using 'self.' preceding all parameters and columns names.
- # a list of targets (a subset of the columns) (again with 'self.')
- # a list of parameters in the equations (again with 'self.')
- # the population size of the GA
- # the maximum number of generations for the GA.
- # whether we want to predict per time point (i.e. we reset the state values of the previous time point to their
- # observed values.
- # It returns a prediction for the training and test sets that are the result of the best parameter setting.
- def dynamical_systems_model_ga(self, train_X, train_y, test_X, test_y, columns, equations, targets, parameters, pop_size=10, max_generations=100, per_time_step=True):
- prng = random.Random()
- evaluator = Evaluator()
- model = Model()
-
- # Create the model.
- model.set_model(columns, equations, parameters)
-
- # Set the desired/known values in our evaluator.
- evaluator.set_values(model, train_X, train_y, test_X, test_y, targets)
- ea = inspyred.ec.GA(prng)
- ea.terminator = inspyred.ec.terminators.generation_termination
-
- # Let it run.
- final_pop = ea.evolve(generator=evaluator.generator, evaluator=evaluator.evaluator_single_objective, pop_size=pop_size, maximize=False, bounder=None,max_generations=max_generations)
-
- # Select the best one and use it to predict.
- best = min(final_pop)
- train_fitness, y_train_pred = evaluator.predict(best.candidate, training=True, per_time_step=per_time_step)
- test_fitness, y_test_pred = evaluator.predict(best.candidate, training=False, per_time_step=per_time_step)
- return y_train_pred, y_test_pred
-
- # Applies a known dynamical systems model for a regression problem by tuning its parameters towards the data using SA.
- # Hereto, it can use multiple objectives but will just average the values of each one. In the end, one solution
- # will be provided. To be provided are:
- # training set (both input and target)
- # test set (both input and target)
- # a list of columns the model addresses (i.e. the states), the string should be preceded by 'self.' in order for the approach to work.
- # a list of equations to derive the specified states, again using 'self.' preceding all parameters and columns names.
- # a list of targets (a subset of the columns) (again with 'self.')
- # a list of parameters in the equations (again with 'self.')
- # the population size for SA
- # the maximum number of generations for the SA.
- # whether we want to predict per time point (i.e. we reset the state values of the previous time point to their
- # observed values.
- # It returns a prediction for the training and test sets that are the result of the best parameter setting.
- def dynamical_systems_model_sa(self, train_X, train_y, test_X, test_y, columns, equations, targets, parameters, pop_size=10, max_generations=100, per_time_step=True):
- prng = random.Random()
- evaluator = Evaluator()
- model = Model()
-
- # Create the model.
- model.set_model(columns, equations, parameters)
-
- # Set the desired/known values in our evaluator.
- evaluator.set_values(model, train_X, train_y, test_X, test_y, targets)
- ea = inspyred.ec.SA(prng)
- ea.terminator = inspyred.ec.terminators.generation_termination
-
- # Let it run.
- final_pop = ea.evolve(generator=evaluator.generator, evaluator=evaluator.evaluator_single_objective, maximize=False, bounder=None, max_generations=max_generations)
-
- # Select the best one and use it to predict.
- best = min(final_pop)
- train_fitness, y_train_pred = evaluator.predict(best.candidate, training=True, per_time_step=per_time_step)
- test_fitness, y_test_pred = evaluator.predict(best.candidate, training=False, per_time_step=per_time_step)
- return y_train_pred, y_test_pred
-
- # Apply an echo state network for regression upon the training data (with the specified reservoir size),
- # and use the created network to predict the outcome for both the
- # test and training set. It returns the predictions for the training and test set.
- def reservoir_computing(self, train_X, train_y, test_X, test_y, reservoir_size=100, a=0.8, per_time_step=False, gridsearch=True, gridsearch_training_frac=0.7):
- # Simply apply the classification variant, but only consider the numerical predictions.
- tc = TemporalClassificationAlgorithms()
- pred_train_y, pred_test_y, pred_train_y_val, pred_test_y_val = tc.reservoir_computing(train_X, train_y, test_X, test_y, reservoir_size=reservoir_size, a=a, per_time_step=per_time_step, gridsearch=gridsearch, gridsearch_training_frac=gridsearch_training_frac, error='mse')
- return pred_train_y_val, pred_test_y_val
-
-
- # Apply a recurrent neural network for regression upon the training data (with the specified number of
- # hidden neurons and iterations), and use the created network to predict the outcome for both the
- # test and training set. It returns the predictions for the training and test set.
- def recurrent_neural_network(self, train_X, train_y, test_X, test_y, n_hidden_neurons=50, iterations=100, gridsearch=False, gridsearch_training_frac=0.7, outputbias=False):
- # Simply apply the classification variant, but only consider the numerical predictions.
- tc = TemporalClassificationAlgorithms()
- pred_train_y, pred_test_y, pred_train_y_val, pred_test_y_val = tc.recurrent_neural_network(train_X, train_y, test_X, test_y, n_hidden_neurons=n_hidden_neurons, iterations=iterations, gridsearch=gridsearch, gridsearch_training_frac=gridsearch_training_frac, outputbias=outputbias, error='mse')
- return pred_train_y_val, pred_test_y_val
-
- # Do a gridsearch for the time series.
- def gridsearch_time_series(self, train_X, train_y, test_X, test_y, error = 'mse', gridsearch_training_frac=0.7):
- tuned_parameters = {'ar': [0, 5], 'ma':[0, 5], 'd':[1]}
- params = tuned_parameters.keys()
-
- tc = TemporalClassificationAlgorithms()
- combinations = tc.generate_parameter_combinations(tuned_parameters, params)
- split_point = int(gridsearch_training_frac * len(train_X.index))
- train_params_X = train_X.ix[0:split_point,]
- test_params_X = train_X.ix[split_point:len(train_X.index),]
- train_params_y = train_y.ix[0:split_point,]
- test_params_y = train_y.ix[split_point:len(train_X.index),]
-
- if error == 'mse':
- best_error = sys.float_info.max
- elif error == 'accuracy':
- best_error = 0
-
- best_combination = []
- for comb in combinations:
- print comb
- # Order of the keys might have changed.
- keys = tuned_parameters.keys()
- pred_train_y, pred_test_y = self.time_series(train_params_X, train_params_y, test_params_X, test_params_y,
- ar=comb[keys.index('ar')], ma=comb[keys.index('ma')],d=comb[keys.index('d')],
- gridsearch=False)
-
- eval = RegressionEvaluation()
- mse = eval.mean_squared_error(test_params_y, pred_test_y)
- if mse < best_error:
- best_error = mse
- best_combination = comb
-
- print '-------'
- print best_combination
- print '-------'
- return best_combination[keys.index('ar')], best_combination[keys.index('ma')], best_combination[keys.index('d')]
-
-
- # Apply a time series ARIMAX approach, and use the created network to predict the outcome for both the
- # test and training set. It returns the predictions for the training and test set. Parameters can be
- # provided around the learning algorithm and a grid search can also be performed.
- def time_series(self, train_X, train_y, test_X, test_y, ar=1, ma=1, d=0, gridsearch=False, gridsearch_training_frac=0.7):
- if gridsearch:
- ar, ma, d = self.gridsearch_time_series(train_X, train_y, test_X, test_y, gridsearch_training_frac=gridsearch_training_frac, error='mse')
-
- train_dataset = copy.deepcopy(train_X)
- formula = train_y.name + '~1+' + "+".join(train_X.columns)
- train_dataset[train_y.name] = train_y
- test_dataset = copy.deepcopy(test_X)
- test_dataset[test_y.name] = test_y
-
- model = pf.ARIMAX(data=train_dataset,formula=formula,ar=ar,ma=ma)
- x = model.fit()
- x.summary()
- model_pred = model.predict(h=len(train_y.index)-max(ar, ma), oos_data=train_dataset)
- values = np.empty((len(model_pred) + max(ar, ma),1))
- values[:] = np.nan
- values[max(ar, ma):] = model_pred.values
- pred_train = pd.DataFrame(values, index=train_y.index, columns=[train_y.name])
- pred_train.ix[max(ar, ma):,:] = model_pred.values
- pred_test = pd.DataFrame(model.predict(h=len(test_y.index), oos_data=test_dataset).values, index=test_y.index, columns=[test_y.name])
-
- return pred_train, pred_test
diff --git a/PythonCode/Chapter8/__init__.py b/PythonCode/Chapter8/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/PythonCode/Chapter8/dynsys/Evaluator.py b/PythonCode/Chapter8/dynsys/Evaluator.py
deleted file mode 100644
index 5591d354..00000000
--- a/PythonCode/Chapter8/dynsys/Evaluator.py
+++ /dev/null
@@ -1,142 +0,0 @@
-##############################################################
-# #
-# Mark Hoogendoorn and Burkhardt Funk (2017) #
-# Machine Learning for the Quantified Self #
-# Springer #
-# Chapter 8 #
-# #
-##############################################################
-
-from inspyred.ec import emo
-import random
-from sklearn.metrics import mean_squared_error
-import copy
-import pandas as pd
-
-# This class evaluates a dynamical systems model.
-class Evaluator():
-
- training_data = {}
- test_data = {}
- model = []
- eval_aspects = []
- cleaned_eval_aspects = []
- default_start = 'self.'
-
- def __init__(self):
- self.training_data = {}
- self.test_data = {}
- self.model = []
- self.eval_aspects = []
- self.cleaned_eval_aspects = []
-
- # This sets the values for the training and test period. We will just
- # consider the eval_aspects (states/columns) for the evaluation.
- def set_values(self, m, train_X, train_y, test_X, test_y, eval_aspects):
-
- # Create a copy of the data
- self.training_data = copy.deepcopy(train_X)
- self.test_data = copy.deepcopy(test_X)
-
- # Add the targets we use from y to the copy of the data to create a single dataset
- # for training and testing.
- for col in eval_aspects:
- self.training_data[col[len(self.default_start):]] = train_y[col[len(self.default_start):]]
- self.test_data[col[len(self.default_start):]] = test_y[col[len(self.default_start):]]
- self.model = m
- self.eval_aspects = eval_aspects
-
- # The eval_aspects array is poluted with the 'self.', we also create a variant without.
- self.cleaned_eval_aspects = []
- for eval in eval_aspects:
- self.cleaned_eval_aspects.append(eval[len(self.default_start):])
-
- # This funtion generates a random initial candidate for our optimization algorithm and returns it.
- def generator(self, random, args):
- numb_parameters = len(self.model.parameter_names)
- return [random.uniform(-1.0, 1.0) for _ in range(numb_parameters)]
-
- # This functions takes a candidate (i.e. a number of parameter settings for our
- # dynamical systems model) and the dataset and evaluates how well it performs.
- # in terms of the mean squared error per eval_aspect. It return the fitness
- # and the prediction. If we have the per_time_step=True we will overwrite the
- # predicted values for the previous time point with the real values.
- def evaluator_internal(self, candidate, dataset, per_time_step=False):
- self.model.reset()
- y = []
- y.append(dataset.ix[0,self.cleaned_eval_aspects].values)
-
- # Go through the dataset, all but last as we need to evaluate our
- # prediction with the next time point.
- for step in range(0, len(dataset.index)-1):
- state_values = []
-
- # Get the relevant values for each of the states
- # in our model.
- for col in self.model.state_names:
- # Overwrite the values we predicted previously for the evaluation states
- # if we do it per time step or if we do not have any prediction yet.
- if per_time_step or (step == 0):
- state_values.append(dataset.ix[step, col[len(self.default_start):]])
-
- # Only overwrite values for the non eval states if we do not do it
- # per time step and use our predicted value for the eval aspects.
- else:
- if col in self.eval_aspects:
- state_values.append(pred_values[self.eval_aspects.index(col)])
- else:
- state_values.append(dataset.ix[step, col[len(self.default_start):]])
-
- # Set the state values, parameter values, and execute the model.
- self.model.set_state_values(state_values)
- self.model.set_parameter_values(candidate)
- self.model.execute_steps(1)
-
- evals = []
- pred_values = []
-
- # Determine the error for the evaluation aspects.
- for eval in self.eval_aspects:
- pred_value = self.model.get_values(eval)[-1]
- pred_values.append(pred_value)
- mse = mean_squared_error([pred_value], [dataset.ix[step+1, col[len(self.default_start):]]])
- evals.append(mse)
-
- # Store the fitness for all aspects.
- fitness = emo.Pareto(evals)
-
- # Store the predicted values.
- y.append(pred_values)
- # And return the fitness and the predicted values.
- y_frame = pd.DataFrame(y, columns=self.cleaned_eval_aspects)
- return fitness, y_frame
-
- # This function evaluates a population of candidates in a multi-objective way.
- # It return the fitness on each eval_aspect for each candidate.
- def evaluator_multi_objective(self, candidates, args):
- fitness_values = []
- for c in candidates:
- fitness, y_pred = self.evaluator_internal(c, self.training_data, per_time_step=True)
- fitness_values.append(fitness)
- return fitness_values
-
- # This function evaluates a population of candidates in a single objective way.
- # It returns a single fitness value per candidate.
- def evaluator_single_objective(self, candidates, args):
- fitness_values = []
- for c in candidates:
- fitness, y_pred = self.evaluator_internal(c, self.training_data, per_time_step=True)
-
- # Sum the fitness values over all aspects.
- fitness_values.append(sum(fitness))
- return fitness_values
-
- # Generate a prediction for a candidate on either the training set (training=True) or the test
- # set. We can again select whether we want to set the values for the previous time point
- # to the true values all the time. We return the fitness van the predicted values.
- def predict(self, candidate, training=True, per_time_step=False):
- if training:
- fitness, y_pred = self.evaluator_internal(candidate, self.training_data, per_time_step=per_time_step)
- else:
- fitness, y_pred = self.evaluator_internal(candidate, self.test_data, per_time_step=per_time_step)
- return fitness, y_pred
\ No newline at end of file
diff --git a/PythonCode/Chapter8/dynsys/Model.py b/PythonCode/Chapter8/dynsys/Model.py
deleted file mode 100644
index 6e18fea9..00000000
--- a/PythonCode/Chapter8/dynsys/Model.py
+++ /dev/null
@@ -1,134 +0,0 @@
-##############################################################
-# #
-# Mark Hoogendoorn and Burkhardt Funk (2017) #
-# Machine Learning for the Quantified Self #
-# Springer #
-# Chapter 8 #
-# #
-##############################################################
-
-import math
-
-# The class represents a dynamical systems model.
-class Model:
-
- state_names = []
- state_values = []
- predicted_values = []
- state_equations = []
- parameter_names = []
- parameter_values = []
- t = 0
- max_value = 10000
-
- def __init__(self):
- self.state_names = []
- self.state_values = []
- self.predicted_values = []
- self.state_equations = []
- self.parameter_names = []
- self.parameter_values = []
- self.t = 0
-
- # This function sets the model in the global variables. It uses the state names, equations
- # in the same order, and the parameter names.
- def set_model(self, state_names, state_equations, parameter_names):
- self.state_names = state_names
- self.state_values.append([])
- self.state_equations = state_equations
- self.parameter_names = parameter_names
-
- # This function resets the model, it empties the predictions for the states and sets
- # the time point to 0.
- def reset(self):
- self.t = 0
- self.state_values = []
- self.state_values.append([])
- self.predicted_values = []
- self.predicted_values.append([])
-
- # This function sets the parameter values in the model.
- def set_parameter_values(self, param_values):
- for p in range(len(self.parameter_names)):
- # We do a bit of magic here, since we do not know the variable
- # names for the parameters up front we execute it in this
- # way. This results in global variables with the proper values.
- exec("%s = %f" % (self.parameter_names[p], param_values[p]))
- self.parameter_values.append(param_values[p])
-
- # This functions sets the state values in the model.
- def set_state_values(self, state_values):
- for s in range(len(self.state_names)):
- # We do a bit of magic here, since we do not know the variable
- # names for the states up front we execute it in this
- # way. This results in global variables with the proper values.
- exec("%s = %f" % (self.state_names[s], state_values[s]))
- self.state_values[self.t].append(state_values[s])
-
- # Some basic printing of the model.
- def print_model(self):
- for e in range(len(self.state_equations)):
- print str(self.state_names[e]) + ' = ',
- print self.state_equations[e]
-
- # Prints the model to a file with the generation of the high level optimization algorithm.
- def print_model_to_file(self, file, generation):
- file.write('======================' + str(generation) + '======================\n')
- for e in range(len(self.state_equations)):
- file.write(str(self.state_names[e]) + ' = ' + str(self.state_equations[e]) + '\n')
-
- # Return the model in a string representation.
- def to_string(self):
- result = ''
- for e in range(len(self.state_equations)):
- result += str(self.state_equations[e])
- return result
-
- # Executes the model for the given number of time steps, given the current
- # settings for the states.
- def execute_steps(self, steps):
-
- # Repeat for the given number of time steps.
- for i in range(0,steps):
-
- # Allocate memory for the state values and the predicted values.
- self.state_values.append([0]*len(self.state_names))
- self.predicted_values.append([0]*len(self.state_names))
- self.t += 1
-
- # Compute the predicted values based on the current values for the states.
- for v in range(len(self.state_names)):
-
- # We compute the value of the state equation.
- value = eval(self.state_equations[v])
-
- # If the number is fishy, we select the maximum value.
- if math.isinf(value) or math.isnan(value):
- value = self.max_value
-
- # And we set the value of the state accordingly.
- exec("%s = %f" % (self.state_names[v], value))
-
- # For debugging.
- self.state_values[self.t][v] = eval(self.state_names[v])
-
- # And we add the prediction for the time point to the file.
- self.predicted_values[self.t][v] = self.state_values[self.t][v]
-
-
- # This function return the values of the specified state from time point 0 to now.
- def get_values(self, state):
-
- # If we do not have any values, we do not do anything.
- if self.t == 0:
- print 'number of values ' + str(self.t)
- values = []
-
- # Get the index of the state
- index = self.state_names.index(state)
-
- # And get the values over all time points.
- for i in range(1, len(self.predicted_values)):
- value = self.predicted_values[i][index]
- values.append(value)
- return values
\ No newline at end of file
diff --git a/PythonCode/Chapter8/dynsys/__init__.py b/PythonCode/Chapter8/dynsys/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/PythonCode/Python2_conda_requirements.txt b/PythonCode/Python2_conda_requirements.txt
deleted file mode 100644
index 8b6ee65f..00000000
--- a/PythonCode/Python2_conda_requirements.txt
+++ /dev/null
@@ -1,10 +0,0 @@
-gensim==0.13.3
-nltk==3.2.1
-unidecode==0.04.20
-statsmodels==0.6.1
-matplotlib==1.5.3
-statsmodels==0.6.1
-scikit-learn==0.18.1
-numpy==1.11.1
-pandas==0.19.2
-scipy==0.18.1
diff --git a/PythonCode/Python2_pip_requirements.txt b/PythonCode/Python2_pip_requirements.txt
deleted file mode 100644
index 438fce9b..00000000
--- a/PythonCode/Python2_pip_requirements.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-pyflux==0.4.14
-pykalman==0.9.5
-pyclust==0.1.15
-inspyred==1.0.1
-pybrain==0.3
-treelib==1.3.5
diff --git a/PythonCode/__init__.py b/PythonCode/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/PythonCode/ch3_visualization.py b/PythonCode/ch3_visualization.py
deleted file mode 100644
index 2ae15252..00000000
--- a/PythonCode/ch3_visualization.py
+++ /dev/null
@@ -1,163 +0,0 @@
-##############################################################
-# #
-# Mark Hoogendoorn and Burkhardt Funk (2017) #
-# Machine Learning for the Quantified Self #
-# Springer #
-# Chapter 3 - Exemplary graphs #
-# #
-##############################################################
-
-import pandas as pd
-import numpy as np
-import matplotlib.pyplot as plot
-import math
-from scipy.stats import norm
-from Chapter3.DataTransformation import LowPassFilter
-from sklearn.decomposition import PCA
-
-np.random.seed(0)
-
-# Figure 3.1
-
-df = pd.DataFrame(np.arange(0, 1, 0.001), columns=list('X'))
-mean = 0.5
-sd = 0.1
-p = pd.DataFrame(norm.pdf(df,mean,sd), columns=list('p'))
-plot.hold(True)
-plot.plot(df, p)
-plot.xlabel('$X_{1}$')
-plot.ylabel('$P(X_{1})$')
-ax = plot.axes()
-ax.fill_between(df['X'], 0, p['p'], where=df['X']<=0.3, facecolor='red')
-ax.fill_between(df['X'], 0, p['p'], where=df['X']>=0.7, facecolor='red')
-ax.annotate('outliers', xy=(0.3, 0.25), xytext=(0.45, 0.7),
- arrowprops=dict(facecolor='black', shrink=0.05))
-ax.annotate('outliers', xy=(0.7, 0.25), xytext=(0.45, 0.7),
- arrowprops=dict(facecolor='black', shrink=0.05))
-plot.hold(False)
-plot.show()
-
-# Figure 3.2
-
-df = pd.DataFrame(np.random.random_sample(size=(100, 2)), columns=list('XY'))
-plot.plot(df['X'], df['Y'], 'ro')
-plot.xlabel('X$_{1}$')
-plot.ylabel('X$_{2}$')
-plot.hold(True)
-plot.plot([0.5], [0.5], 'ko')
-
-# draw the circle with the arrow
-# http://stackoverflow.com/questions/34823886/plotting-circle-diagram-with-rotary-arrow
-
-radius = 0.2
-angle = 20
-angle_rad = angle * math.pi / 180 # degrees to radians
-# Draw circle
-circle = plot.Circle((0.5,0.5), radius, color='black', fill=False)
-fig = plot.gcf()
-fig.gca().add_artist(circle)
-
-ax = plot.axes()
-ax.arrow(0.5, 0.5,
- (radius - 0.02) * math.cos(angle_rad),
- (radius - 0.02) * math.sin(angle_rad),
- head_width=0.02, head_length=0.02, fc='k', ec='k')
-ax.annotate('$d_{min}$', xy=(.6, .5), xycoords='axes fraction',
- horizontalalignment='center', verticalalignment='center')
-plot.show()
-plot.hold(False)
-
-# Figure 3.3
-
-np.random.seed(0)
-df1 = pd.DataFrame(np.random.randint(10,20,size=(40, 2)), columns=list('XY'))
-df2 = pd.DataFrame(np.random.randint(70,90,size=(5, 2)), columns=list('XY'))
-df1 = df1 / float(100)
-df2 = df2 / float(100)
-plot.hold(True)
-plot.plot(df1['X'], df1['Y'], 'ro')
-plot.plot(0.7, 0.7, 'ro')
-plot.plot(df2['X'], df2['Y'], 'ro')
-plot.plot(0.2, 0.2, 'ro')
-plot.xlabel('X$_{1}$')
-plot.ylabel('X$_{2}$')
-plot.xlim([0,1])
-plot.ylim([0,1])
-plot.plot([0.25], [0.25], 'ko')
-plot.plot([0.65], [0.65], 'ko')
-plot.show()
-
-# Figure 3.4
-
-# Sample frequency (Hz)
-fs = 100
-
-# Create time points....
-t = pd.DataFrame(np.arange(0, 16, float(1)/fs), columns=list('X'))
-c1 = 3 * np.sin(2 * math.pi * 0.1 * t)
-c2 = 2 * np.sin(2 * math.pi * t)
-plot.hold(True)
-plot.plot(t, c1, 'b--')
-plot.plot(t, c2, 'b:')
-plot.plot(t, c1+c2, 'b-')
-LowPass = LowPassFilter()
-new_dataset = LowPass.low_pass_filter(c1+c2, 'X', fs, 0.5, order=3, phase_shift=True)
-plot.plot(t, new_dataset['X_lowpass'], 'r-')
-plot.legend(['$3 \cdot sin(2 \cdot \pi \cdot 0.1 \cdot t))$', '$2 \cdot sin(2 \cdot \pi \cdot t))$', '$combined$', '$combined$ $after$ $filter (f_{c}=0.5Hz, n=3)$'],
- loc=4, fontsize='small')
-plot.xlabel('time')
-plot.ylabel('$X_{1}$')
-plot.show()
-
-# Figure 3.5
-
-df = pd.DataFrame(np.arange(0, 1, 0.1), columns=list('X'))
-df['Y'] = pd.DataFrame(np.random.normal(0, 0.1, size=(10,1)), columns=list('Y'))
-df['Y'] = df['Y'] + df['X']
-
-pca = PCA(n_components=2, svd_solver='full')
-pca.fit(df)
-first_component = pca.components_[0]
-second_component = pca.components_[1]
-
-factor_1 = first_component[0]/first_component[1]
-factor_2 = second_component[0]/second_component[1]
-
-plot.hold(True)
-plot.plot(df['X'], df['Y'], 'ro')
-plot.plot(df['X'], df['X']*factor_1, 'r-')
-plot.plot(df['X'], df['X']*factor_2+0.5, 'b-')
-plot.legend(['$data$', '$first$ $component$', '$second$ $component$'], loc=2)
-plot.xlim([0,1])
-plot.ylim([0,1])
-plot.xlabel('$X_{1}$')
-plot.ylabel('$X_{2}$')
-plot.hold(False)
-plot.show()
-
-# Figure 3.6
-
-transformed_dataset = np.inner(first_component, df)
-plot.hold(True)
-plot.plot(transformed_dataset, [0]*transformed_dataset.shape[0], 'ro')
-plot.ylim([-0.05,1])
-plot.xlabel('$X\'_{1}$')
-ax = plot.axes()
-ax.get_yaxis().set_visible(False)
-ax.spines['right'].set_visible(False)
-ax.spines['left'].set_visible(False)
-ax.spines['bottom'].set_position('zero')
-plot.legend(['$transformed$ $data$'], loc=(0.5, 0.1))
-plot.hold(False)
-plot.show()
-
-# Figure 3.7
-
-transformed_dataset = np.inner(pca.components_, df)
-plot.hold(True)
-plot.plot(transformed_dataset[0], transformed_dataset[1], 'ro')
-plot.xlabel('$X\'_{1}$')
-plot.ylabel('$X\'_{2}$')
-plot.legend(['$transformed$ $data$'], loc=4)
-plot.hold(False)
-plot.show()
diff --git a/PythonCode/ch4_visualization.py b/PythonCode/ch4_visualization.py
deleted file mode 100644
index 00bc3db4..00000000
--- a/PythonCode/ch4_visualization.py
+++ /dev/null
@@ -1,62 +0,0 @@
-##############################################################
-# #
-# Mark Hoogendoorn and Burkhardt Funk (2017) #
-# Machine Learning for the Quantified Self #
-# Springer #
-# Chapter 4 - Exemplary graphs #
-# #
-##############################################################
-
-import pandas as pd
-import numpy as np
-import matplotlib.pyplot as plot
-import math
-import copy
-from scipy.stats import norm
-from sklearn.decomposition import PCA
-from Chapter4.FrequencyAbstraction import FourierTransformation
-import re
-
-np.random.seed(0)
-
-# Figure 4.1
-
-# Sample frequency (Hz)
-fs = 10
-
-# Create time points....
-df = pd.DataFrame(np.arange(0, 16.1, float(1)/fs), columns=list('X'))
-c1 = 3 * np.sin(2 * math.pi * 0.2 * df['X'])
-c2 = 2 * np.sin(2 * math.pi * 0.25 * (df['X']-2)) + 5
-df['Y'] = c1 + c2
-
-plot.hold(True)
-plot.plot(df['X'], df['Y'], 'b-')
-plot.legend(['$example$ $measurement$ $sequence$'], loc=3, fontsize='small')
-plot.xlabel('time')
-plot.ylabel('$X_{1}$')
-plot.show()
-
-# Figure 4.2
-
-FreqAbs = FourierTransformation()
-data_table = FreqAbs.abstract_frequency(copy.deepcopy(df), ['Y'], 160, fs)
-# Get the frequencies from the columns....
-frequencies = []
-values = []
-for col in data_table.columns:
- val = re.findall(r'freq_\d+\.\d+_Hz', col)
- if len(val) > 0:
- frequency = float((val[0])[5:len(val)-4])
- frequencies.append(frequency)
- values.append(data_table.ix[data_table.index, col])
-
-fig = plot.figure()
-plot.hold(True)
-ax1 = fig.add_subplot(111)
-plot.xlim([0, 5])
-ax1.plot(frequencies, values, 'b+')
-ax1.set_xlabel('Frequency (Hz)')
-ax1.set_ylabel('$a$')
-plot.show()
-
diff --git a/PythonCode/ch5_visualization.py b/PythonCode/ch5_visualization.py
deleted file mode 100644
index d9b4301f..00000000
--- a/PythonCode/ch5_visualization.py
+++ /dev/null
@@ -1,343 +0,0 @@
-##############################################################
-# #
-# Mark Hoogendoorn and Burkhardt Funk (2017) #
-# Machine Learning for the Quantified Self #
-# Springer #
-# Chapter 5 - Exemplary graphs #
-# #
-##############################################################
-
-import pandas as pd
-import numpy as np
-import matplotlib.pyplot as plot
-import math
-import copy
-from scipy.stats import norm
-from sklearn.decomposition import PCA
-from Chapter4.FrequencyAbstraction import FourierTransformation
-from matplotlib.patches import Rectangle
-import re
-import sklearn
-
-np.random.seed(0)
-
-# Figure 5.1
-
-points_x = [0.25, 0.75]
-points_y = [0.25, 0.75]
-
-plot.hold(True)
-plot.plot(points_x, points_y, 'ro')
-manhattan_x = [points_x[0], points_x[0], points_x[1]]
-manhattan_y = [points_y[0], points_y[1], points_y[1]]
-euclidean_x = [points_x[0], points_x[1]]
-euclidean_y = [points_y[0], points_y[1]]
-
-plot.plot(manhattan_x, manhattan_y, 'b-')
-plot.plot(euclidean_x, euclidean_y, 'r:')
-
-plot.legend(['$measurements$','$manhattan$ $distance$', '$euclidean$ $distance$'], loc=4, fontsize='small')
-plot.xlabel('$X_{1}$')
-plot.ylabel('$X_{2}$')
-plot.xlim([0,1])
-plot.ylim([0,1])
-plot.hold(False)
-plot.show()
-
-# Figure 5.2 (complicated figure....)
-
-df = pd.DataFrame(np.arange(0, 1, 0.001), columns=list('X'))
-mean = 0.5
-sd = 0.1
-p = pd.DataFrame(norm.pdf(df,mean,sd), columns=list('p'))
-mean2 = 0.6
-sd2 = 0.2
-p2 = pd.DataFrame(norm.pdf(df,mean2,sd2), columns=list('p'))
-plot.hold(True)
-
-f, axarr = plot.subplots(7, 3)
-f.subplots_adjust(hspace=0.8)
-
-axarr[0, 0].axes.set_axis_off()
-axarr[0, 0].set_xlim([0,1])
-axarr[0, 0].set_ylim([0,1])
-axarr[0, 0].text(0, 0.65, '1', fontsize=12,
- bbox={'facecolor':'grey', 'alpha':0.5, 'pad':10})
-axarr[0, 0].text(0.2, 0.8, '$x_{1,qs_{i}}^{1},\dots,x_{N_{qs_{i}},qs_{i}}^{1}$',
- verticalalignment='bottom', horizontalalignment='left',
- color='black', fontsize=10)
-axarr[0, 0].plot([0.4, 0.4], [0.55, 0.75], 'k:')
-axarr[0, 0].text(0.2, 0.2, '$x_{1,qs_{i}}^{p},\dots,x_{N_{qs_{i}},qs_{i}}^{p}$',
- verticalalignment='bottom', horizontalalignment='left',
- color='black', fontsize=10)
-axarr[0, 0].arrow(0.75, 0.65, 0.15, 0, head_width=0.05, head_length=0.05, fc='k', ec='k')
-axarr[0, 1].axes.set_axis_off()
-axarr[0, 1].set_xlim([0,1])
-axarr[0, 1].set_ylim([0,1])
-axarr[0, 1].text(0.2, 0.8, '$x\_mean_{qs_{i}}^{1}$',
- verticalalignment='bottom', horizontalalignment='left',
- color='black', fontsize=10)
-axarr[0, 1].plot([0.4, 0.4], [0.55, 0.75], 'k:')
-axarr[0, 1].text(0.2, 0.2, '$x\_mean_{qs_{i}}^{p}$',
- verticalalignment='bottom', horizontalalignment='left',
- color='black', fontsize=10)
-axarr[0, 2].arrow(0, 0.7, 0.15, 0, head_width=0.05, head_length=0.05, fc='k', ec='k')
-axarr[0, 2].axes.set_axis_off()
-axarr[0, 2].set_xlim([0,1])
-axarr[0, 2].set_ylim([0,1])
-axarr[0, 2].text(0.3, 0.55, '$cluster$ $on$ $mean$ $values$',
- verticalalignment='bottom', horizontalalignment='left',
- color='black', fontsize=12)
-axarr[1, 0].axes.set_axis_off()
-axarr[1, 1].plot(df['X'], p, 'b-')
-axarr[1, 1].xaxis.set_ticklabels([])
-axarr[1, 1].yaxis.set_ticklabels([])
-axarr[1, 1].set_xlabel('$X_{1}$')
-axarr[1, 1].set_ylabel('$P(X_{1})$')
-axarr[1, 2].axes.set_axis_off()
-axarr[2, 0].axes.set_axis_off()
-axarr[2, 0].set_xlim([0,1])
-axarr[2, 0].set_ylim([0,1])
-axarr[2, 0].text(0, 0.65, '2', fontsize=12,
- bbox={'facecolor':'grey', 'alpha':0.5, 'pad':10})
-axarr[2, 0].text(0.2, 0.8, '$x_{1,qs_{i}}^{1},\dots,x_{N_{qs_{i}},qs_{i}}^{1}$',
- verticalalignment='bottom', horizontalalignment='left',
- color='black', fontsize=10)
-axarr[2, 0].plot([0.4, 0.4], [0.55, 0.75], 'k:')
-axarr[2, 0].text(0.2, 0.2, '$x_{1,qs_{i}}^{p},\dots,x_{N_{qs_{i}},qs_{i}}^{p}$',
- verticalalignment='bottom', horizontalalignment='left',
- color='black', fontsize=10)
-axarr[2, 0].arrow(0.75, 0.65, 0.15, 0, head_width=0.05, head_length=0.05, fc='k', ec='k')
-axarr[2, 1].axes.set_axis_off()
-axarr[2, 1].set_xlim([0,1])
-axarr[2, 1].set_ylim([0,1])
-axarr[2, 1].plot([0.5,0.5], [0,1], 'k:')
-axarr[2, 2].axes.set_axis_off()
-axarr[2, 2].set_xlim([0,1])
-axarr[2, 2].set_ylim([0,1])
-axarr[2, 2].arrow(0, 0.7, 0.15, 0, head_width=0.05, head_length=0.05, fc='k', ec='k')
-axarr[2, 2].text(0.3, 0.55, '$cluster$ $on$ $distribution$ $parameters$',
- verticalalignment='bottom', horizontalalignment='left',
- color='black', fontsize=12)
-axarr[3, 0].axes.set_axis_off()
-axarr[3, 0].set_xlim([0,1])
-axarr[3, 0].set_ylim([0,1])
-axarr[3, 1].plot(df['X'], p, 'b-')
-axarr[3, 1].xaxis.set_ticklabels([])
-axarr[3, 1].yaxis.set_ticklabels([])
-axarr[3, 1].set_xlabel('$X_{p}$')
-axarr[3, 1].set_ylabel('$P(X_{p})$')
-axarr[3, 2].axes.set_axis_off()
-
-axarr[4, 0].axes.set_axis_off()
-axarr[4, 0].set_xlim([0,1])
-axarr[4, 0].set_ylim([0,1])
-axarr[4, 0].text(0, 0.65, '3', fontsize=12,
- bbox={'facecolor':'grey', 'alpha':0.5, 'pad':10})
-axarr[4, 0].text(0.2, 0.8, '$x_{1,qs_{i}}^{1},\dots,x_{N_{qs_{i}},qs_{i}}^{1}$',
- verticalalignment='bottom', horizontalalignment='left',
- color='black', fontsize=10)
-axarr[4, 0].plot([0.4, 0.4], [0.55, 0.75], 'k:')
-axarr[4, 0].text(0.2, 0.2, '$x_{1,qs_{i}}^{p},\dots,x_{N_{qs_{i}},qs_{i}}^{p}$',
- verticalalignment='bottom', horizontalalignment='left',
- color='black', fontsize=10)
-axarr[4, 0].plot([0.4, 0.4], [-0.2, 0.2], 'k:')
-axarr[4, 1].plot(df['X'], p, 'b-')
-axarr[4, 1].plot(df['X'], p2, 'r-')
-axarr[4, 1].legend(['$i$', '$j$'], loc=2, fontsize='xx-small')
-axarr[4, 1].xaxis.set_ticklabels([])
-axarr[4, 1].yaxis.set_ticklabels([])
-axarr[4, 1].set_xlabel('$X_{1}$')
-axarr[4, 1].set_ylabel('$P(X_{1})$')
-axarr[4, 2].axes.set_axis_off()
-axarr[5, 0].axes.set_axis_off()
-axarr[5, 0].set_xlim([0,1])
-axarr[5, 0].set_ylim([0,1])
-axarr[5, 0].text(0.2, 0.8, '$x_{1,qs_{j}}^{1},\dots,x_{N_{qs_{j}},qs_{j}}^{1}$',
- verticalalignment='bottom', horizontalalignment='left',
- color='black', fontsize=10)
-axarr[5, 0].plot([0.4, 0.4], [0.55, 0.75], 'k:')
-axarr[5, 0].text(0.2, 0.2, '$x_{1,qs_{j}}^{p},\dots,x_{N_{qs_{j}},qs_{j}}^{p}$',
- verticalalignment='bottom', horizontalalignment='left',
- color='black', fontsize=10)
-axarr[5, 0].arrow(0.75, 0.65, 0.15, 0, head_width=0.05, head_length=0.05, fc='k', ec='k')
-axarr[5, 1].axes.set_axis_off()
-axarr[5, 1].set_xlim([0,1])
-axarr[5, 1].set_ylim([0,1])
-axarr[5, 1].plot([0.5,0.5], [0,1], 'k:')
-axarr[5, 2].axes.set_axis_off()
-axarr[5, 2].set_xlim([0,1])
-axarr[5, 2].set_ylim([0,1])
-axarr[5, 2].arrow(0, 0.7, 0.15, 0, head_width=0.05, head_length=0.05, fc='k', ec='k')
-axarr[5, 2].text(0.3, 0.55, '$cluster$ $on$ $p$ $values$ $between$ $i$ $and$ $j$',
- verticalalignment='bottom', horizontalalignment='left',
- color='black', fontsize=12)
-axarr[6, 0].axes.set_axis_off()
-axarr[6, 0].set_xlim([0,1])
-axarr[6, 0].set_ylim([0,1])
-axarr[6, 1].plot(df['X'], p, 'b-')
-axarr[6, 1].plot(df['X'], p2, 'r-')
-axarr[6, 1].legend(['$i$', '$j$'], loc=2, fontsize='xx-small')
-axarr[6, 1].xaxis.set_ticklabels([])
-axarr[6, 1].yaxis.set_ticklabels([])
-axarr[6, 1].set_xlabel('$X_{p}$')
-axarr[6, 1].set_ylabel('$P(X_{p})$')
-axarr[6, 2].axes.set_axis_off()
-plot.hold(False)
-plot.show()
-
-# Figure 5.3
-
-time = np.array([1,2,3,4,5,6,7])
-y_arnold = np.array([0.2,0.2,0.5,0.2,0.2,0.2,0.2])
-y_eric = np.array([0.18,0.18,0.18,0.34,0.5,0.34,0.18])
-plot.hold(True)
-plot.plot(time, y_arnold, 'b-o')
-plot.plot(time, y_eric, 'r:*')
-
-plot.legend(['$Arnold$','$Eric$'], loc=1, fontsize='small')
-plot.xlabel('time')
-plot.ylabel('$X_{1}$')
-plot.ylim([0,1])
-plot.hold(False)
-plot.show()
-
-# Figure 5.4
-
-f, axarr = plot.subplots(2, 2)
-f.subplots_adjust(hspace=0)
-f.subplots_adjust(wspace=0)
-plot.hold(True)
-axarr[0, 0].axes.set_axis_off()
-axarr[0, 0].set_xlim([0,max(1-y_arnold)+0.05])
-axarr[0, 0].set_ylim([1,8])
-axarr[0, 0].plot(1-y_arnold, time+0.5, 'b-o')
-axarr[1, 0].axes.set_axis_off()
-axarr[0, 1].xaxis.set_ticklabels([])
-axarr[0, 1].xaxis.set_ticks(time)
-axarr[0, 1].set_xlim([1,8])
-axarr[0, 1].yaxis.set_ticklabels([])
-axarr[0, 1].yaxis.set_ticks(time)
-for t in time:
- axarr[0, 1].plot([t,t], [min(time), max(time)+1], 'k:')
-for t in time:
- axarr[0, 1].plot([min(time), max(time)+1], [t,t], 'k:')
-axarr[0, 1].add_patch(Rectangle((1, 1), 1, 1,alpha=1))
-axarr[0, 1].add_patch(Rectangle((2, 2), 1, 1,alpha=1))
-axarr[0, 1].add_patch(Rectangle((3, 2), 1, 1,alpha=1))
-axarr[0, 1].add_patch(Rectangle((4, 3), 1, 1,alpha=1))
-axarr[0, 1].add_patch(Rectangle((5, 3), 1, 1,alpha=1))
-axarr[0, 1].add_patch(Rectangle((6, 3), 1, 1,alpha=1))
-axarr[0, 1].add_patch(Rectangle((7, 4), 1, 1,alpha=1))
-axarr[0, 1].add_patch(Rectangle((7, 5), 1, 1,alpha=1))
-axarr[0, 1].add_patch(Rectangle((7, 6), 1, 1,alpha=1))
-axarr[0, 1].add_patch(Rectangle((7, 7), 1, 1,alpha=1))
-
-axarr[0, 1].set_ylim([1,8])
-axarr[1, 1].axes.set_axis_off()
-axarr[1, 1].set_xlim([1,8])
-axarr[1, 1].set_ylim([0,max(1-y_eric)+0.05])
-axarr[1, 1].plot(time+0.5, 1-y_eric, 'r-*')
-plot.hold(False)
-plot.show()
-
-# Figure 5.5
-
-np.random.seed(0)
-f, axarr = plot.subplots(2, 2)
-f.subplots_adjust(hspace=0.4)
-f.subplots_adjust(wspace=0.4)
-plot.hold(True)
-
-# Generate random data points.
-numbers = np.vstack([np.random.randint(10,20,size=(10, 2)),np.random.randint(70,90,size=(10, 2))])
-df = pd.DataFrame(numbers, columns=list('XY'))
-centers = pd.DataFrame(np.vstack([[0.25, 0.35], [0.02, 0.02]]), columns=list('XY'))
-df = df / float(100)
-# Set the initial random centers to values such that we have a nice example.
-axarr[0, 0].plot(centers['X'], centers['Y'], 'ko')
-axarr[0, 0].plot(df['X'], df['Y'], 'ro')
-axarr[0, 0].legend(['$centers$', '$data$ $points$'], loc=4, fontsize='small', numpoints=1)
-axarr[0, 0].set_xlim([0,1])
-axarr[0, 0].set_ylim([0,1])
-axarr[0, 0].set_xlabel('$X_{1}$')
-axarr[0, 0].set_ylabel('$X_{2}$')
-axarr[0, 0].set_title('$step$ $1:$ $random$ $centers$')
-
-# Determine the cluster for each of the data points
-
-cluster = np.argmin(sklearn.metrics.pairwise.euclidean_distances(X=df, Y=centers), axis=1)
-df['cluster'] = cluster
-
-axarr[0, 1].plot(df[df['cluster']==0]['X'], df[df['cluster']==0]['Y'], 'ro')
-axarr[0, 1].plot(df[df['cluster']==1]['X'], df[df['cluster']==1]['Y'], 'bo')
-axarr[0, 1].plot(centers['X'], centers['Y'], 'ko')
-axarr[0, 1].legend(['$cluster$ $1$', '$cluster$ $2$', '$centers$'], loc=4, fontsize='small', numpoints=1)
-axarr[0, 1].set_xlim([0,1])
-axarr[0, 1].set_ylim([0,1])
-axarr[0, 1].set_xlabel('$X_{1}$')
-axarr[0, 1].set_ylabel('$X_{2}$')
-axarr[0, 1].set_title('$step$ $2:$ $cluster$ $assignment$')
-
-# Update the centers
-
-centers.ix[0,:] = df[df['cluster']==0].mean(axis=0)[['X','Y']]
-centers.ix[1,:] = df[df['cluster']==1].mean(axis=0)[['X','Y']]
-
-axarr[1, 0].plot(df[df['cluster']==0]['X'], df[df['cluster']==0]['Y'], 'ro')
-axarr[1, 0].plot(df[df['cluster']==1]['X'], df[df['cluster']==1]['Y'], 'bo')
-axarr[1, 0].plot(centers['X'], centers['Y'], 'ko')
-axarr[1, 0].legend(['$cluster$ $1$', '$cluster$ $2$', '$centers$'], loc=4, fontsize='small', numpoints=1)
-axarr[1, 0].set_xlim([0,1])
-axarr[1, 0].set_ylim([0,1])
-axarr[1, 0].set_xlabel('$X_{1}$')
-axarr[1, 0].set_ylabel('$X_{2}$')
-axarr[1, 0].set_title('$step$ $3:$ $update$ $centers$')
-
-# And determine the cluster for each of the data points:
-
-cluster = np.argmin(sklearn.metrics.pairwise.euclidean_distances(X=df[['X', 'Y']], Y=centers), axis=1)
-df['cluster'] = cluster
-
-axarr[1, 1].plot(df[df['cluster']==0]['X'], df[df['cluster']==0]['Y'], 'ro')
-axarr[1, 1].plot(df[df['cluster']==1]['X'], df[df['cluster']==1]['Y'], 'bo')
-axarr[1, 1].plot(centers['X'], centers['Y'], 'ko')
-axarr[1, 1].legend(['$cluster$ $1$', '$cluster$ $2$', '$centers$'], loc=4, fontsize='small', numpoints=1)
-axarr[1, 1].set_xlim([0,1])
-axarr[1, 1].set_ylim([0,1])
-axarr[1, 1].set_xlabel('$X_{1}$')
-axarr[1, 1].set_ylabel('$X_{2}$')
-axarr[1, 1].set_title('$step$ $4:$ $cluster$ $assignment$')
-
-plot.hold(False)
-plot.show()
-
-# Figure 5.7
-
-np.random.seed(0)
-numbers = np.vstack([np.random.randint(0,20,size=(20, 2)),
- np.random.randint(60,100,size=(30, 2)),
- np.random.randint(40,60,size=(20, 2))])
-numbers = pd.DataFrame(numbers, columns=list('XY'))
-numbers = numbers / float(100)
-print numbers
-plot.hold(True)
-values = np.arange(0,1,0.2)
-plot.xlim([0,1])
-plot.ylim([0,1])
-plot.plot(numbers['X'], numbers['Y'], 'ro')
-plot.legend(['$data$ $points$'], loc=4, fontsize='small', numpoints=1)
-for v in values:
- plot.plot([v,v], [min(values), max(values)+1], 'k:')
-for v in values:
- plot.plot([min(values), max(values)+1], [v,v], 'k:')
- ax = plot.gca()
-
-ax.add_patch(Rectangle((0.0, 0), 0.2, 0.2,alpha=0.5, color='grey'))
-ax.add_patch(Rectangle((0.4, 0.4), 0.2, 0.2,alpha=0.5, color='grey'))
-ax.add_patch(Rectangle((0.6, 0.6), 0.4, 0.4,alpha=0.5, color='grey'))
-plot.xlabel('$X_{1}$')
-plot.ylabel('$X_{2}$')
-
-plot.hold(False)
-plot.show()
diff --git a/PythonCode/ch6_visualization.py b/PythonCode/ch6_visualization.py
deleted file mode 100644
index 38108d5d..00000000
--- a/PythonCode/ch6_visualization.py
+++ /dev/null
@@ -1,94 +0,0 @@
-##############################################################
-# #
-# Mark Hoogendoorn and Burkhardt Funk (2017) #
-# Machine Learning for the Quantified Self #
-# Springer #
-# Chapter 6 - Exemplary graphs #
-# #
-##############################################################
-
-import pandas as pd
-import numpy as np
-import matplotlib.pyplot as plot
-import math
-import copy
-from scipy.stats import norm
-from sklearn.decomposition import PCA
-from Chapter4.FrequencyAbstraction import FourierTransformation
-from matplotlib.patches import Rectangle
-import re
-import sklearn
-from mpl_toolkits.mplot3d import Axes3D
-from matplotlib import cm
-
-
-np.random.seed(0)
-
-# Figure 6.1
-
-df = pd.DataFrame(columns=['x', 'y'])
-x = np.random.normal(0, 0.5, 100)
-df['x'] = x
-y = 2.5 * x + 3
-df['y'] = y
-
-a = np.arange(0, 5, 0.1)
-b = np.arange(0, 5, 0.1)
-X, Y = np.meshgrid(a, b)
-
-result = np.empty((0,3))
-for i in b:
- for j in a:
- y_calc = x * i + j
- error = sklearn.metrics.mean_squared_error(y, y_calc)
- result = np.vstack([result, [i, j, error]])
-
-X, Y = np.meshgrid(a, b)
-e_df = pd.DataFrame(result, columns=['b', 'a', 'error'])
-
-Z = e_df['error'].values.reshape(len(X),len(Y))
-
-fig = plot.figure()
-plot.hold(True)
-ax = fig.gca(projection='3d')
-surf = ax.plot_surface(X, Y, Z, rstride=1, cstride=1, cmap='brg_r')
-#ax.scatter(e_df['a'], e_df['b'], e_df['error'])
-ax.set_xlabel('$\\theta_{1}$')
-ax.set_ylabel('$\\theta_{2}$')
-ax.set_zlabel('$E_{in}(h)$')
-fig.colorbar(surf, shrink=0.5, aspect=5)
-print e_df
-
-plot.hold(False)
-plot.show()
-
-# Figure 6.2
-
-plot.hold(True)
-V, U = np.gradient(Z, .2, .2)
-Q = plot.quiver(X, Y, -U, -V, pivot='mid', units='inches')
-plot.xlabel('$\\theta_{1}$')
-plot.ylabel('$\\theta_{2}$')
-plot.hold(False)
-plot.show()
-
-plot.hold(True)
-p = plot.contour(X, Y, Z,cmap='brg_r')
-plot.clabel(p, fontsize=9, inline=1)
-current_value = np.array([0,0])
-x_values = [0]
-y_values = [0]
-V, U = np.gradient(Z, .1, .1)
-steps = 1000
-for i in range(0, steps):
- current_value = current_value - [0.1*V[current_value[0]/0.1, current_value[1]/0.1], 0.1*U[current_value[0]/0.1, current_value[1]/0.1]]
- x_values.append(current_value[0])
- y_values.append(current_value[1])
-plot.plot(x_values, y_values, 'k:')
-plot.gca().arrow(x_values[-1]-0.1, y_values[-1], +0.0001, 0, head_width=0.1, head_length=0.1, fc='k', ec='k')
-plot.xlabel('$\\theta_{1}$')
-plot.ylabel('$\\theta_{2}$')
-plot.legend(['$gradient$ $descent$ $path$'], loc=1, fontsize='small')
-
-plot.hold(False)
-plot.show()
\ No newline at end of file
diff --git a/PythonCode/ch7_visualization.py b/PythonCode/ch7_visualization.py
deleted file mode 100644
index a140ec78..00000000
--- a/PythonCode/ch7_visualization.py
+++ /dev/null
@@ -1,210 +0,0 @@
-##############################################################
-# #
-# Mark Hoogendoorn and Burkhardt Funk (2017) #
-# Machine Learning for the Quantified Self #
-# Springer #
-# Chapter 7 - Exemplary graphs #
-# #
-##############################################################
-
-import pandas as pd
-import numpy as np
-import matplotlib.pyplot as plot
-import math
-import copy
-from scipy.stats import norm
-from sklearn.decomposition import PCA
-from Chapter4.FrequencyAbstraction import FourierTransformation
-from matplotlib.patches import Rectangle
-import re
-import sklearn
-import random
-from mpl_toolkits.mplot3d import Axes3D
-
-np.random.seed(0)
-
-
-# Figure 7.2
-
-plot.hold(True)
-df1 = pd.DataFrame(np.random.normal(30,5,size=(50, 2)), columns=list('XY'))
-df2 = pd.DataFrame(np.random.normal(70,5,size=(50, 2)), columns=list('XY'))
-df1 = df1 / float(100)
-df2 = df2 / float(100)
-# Set the initial random centers to values such that we have a nice example.
-plot.plot(df1['X'], df1['Y'], 'ro')
-plot.plot(df2['X'], df2['Y'], 'bo')
-plot.plot([0,1],[1,0],'k:')
-plot.legend(['$class$ $1$ $(active)$', '$class$ $2$ $(inactive)$', '$decision$ $boundary$'], loc=4, fontsize='small', numpoints=1)
-plot.xlim([0,1])
-plot.ylim([0,1])
-plot.xlabel('$X_{1}$')
-plot.ylabel('$X_{2}$')
-plot.hold(False)
-plot.show()
-
-# Figure 7.3
-
-plot.hold(True)
-df1 = pd.DataFrame(np.vstack([np.hstack([np.random.normal(50,5,size=(50, 1)), np.random.normal(80,5,size=(50, 1))]),
- np.hstack([np.random.normal(50,5,size=(50, 1)), np.random.normal(20,5,size=(50, 1))])]),
- columns=list('XY'))
-df1 = df1 / float(100)
-df2 = pd.DataFrame(np.vstack([np.hstack([np.random.normal(20,5,size=(50, 1)), np.random.normal(50,5,size=(50, 1))]),
- np.hstack([np.random.normal(80,5,size=(50, 1)), np.random.normal(50,5,size=(50, 1))])]),
- columns=list('XY'))
-df2 = df2 / float(100)
-# Set the initial random centers to values such that we have a nice example.
-plot.plot(df1['X'], df1['Y'], 'ro')
-plot.plot(df2['X'], df2['Y'], 'bo')
-x = np.arange(0,1.1,0.1)
-y1 = x
-y2 = 1-x
-plot.plot(x,y1,'k:')
-plot.plot(x,y2,'k:')
-ax = plot.axes()
-ax.fill_between(x, y1, y2, where=y1<=y2, facecolor='grey', linewidth=0.0)
-ax.fill_between(x, y2, y1, where=y1>=y2, facecolor='grey', linewidth=0.0)
-ax.annotate('$P_{1}$', xy=(0.2, 0.2), xytext=(0.3, 0.1),
- arrowprops=dict(facecolor='black', shrink=0.05, width=1, headwidth=3.5, headlength=3), fontsize=15)
-ax.annotate('$P_{2}$', xy=(0.2, 0.8), xytext=(0.3, 0.9),
- arrowprops=dict(facecolor='black', shrink=0.05, width=1, headwidth=3.5, headlength=3), fontsize=15)
-
-plot.legend(['$class$ $1$ $(active)$', '$class$ $2$ $(inactive)$', '$decision$ $boundary$'], loc=4, fontsize='small', numpoints=1)
-plot.xlim([0,1])
-plot.ylim([0,1])
-plot.xlabel('$X_{1}$')
-plot.ylabel('$X_{2}$')
-plot.hold(False)
-plot.show()
-
-# Figure 7.6
-plot.hold(True)
-df1 = pd.DataFrame(np.random.normal(30,5,size=(50, 2)), columns=list('XY'))
-df2 = pd.DataFrame(np.random.normal(70,5,size=(50, 2)), columns=list('XY'))
-df1 = df1 / float(100)
-df2 = df2 / float(100)
-
-# Given the way we have generate out data, the line
-# y = 1-x is the best separating line. This can be
-# written as -x-y+1 = 0
-
-# Let us computer the distance of our data points to this line...
-result1 = ((-1 * df1['X']) + (-1 * df1['Y']) + 1).abs() / float(math.sqrt(2))
-df1['dist'] = result1
-index_closest_point_1 = df1['dist'].idxmin(axis=0)
-result2 = ((-1 * df2['X']) + (-1 * df2['Y']) + 1).abs() / float(math.sqrt(2))
-df2['dist'] = result2
-index_closest_point_2 = df2['dist'].idxmin(axis=0)
-
-# And draw the two lines that go through this point:
-b_1 = df1.ix[index_closest_point_1, 'X'] + df1.ix[index_closest_point_1, 'Y']
-b_2 = df2.ix[index_closest_point_2, 'X'] + df2.ix[index_closest_point_2, 'Y']
-x = np.arange(0,1.1, 0.1)
-y_1 = -1 * x + b_1
-y_2 = -1 * x + b_2
-y = (y_1 + y_2) / 2
-
-
-# Set the initial random centers to values such that we have a nice example.
-plot.plot(df1['X'], df1['Y'], 'ro')
-plot.plot(df2['X'], df2['Y'], 'bo')
-plot.plot(x,y,'k-')
-plot.plot(x,y_1,'k:')
-plot.plot(x,y_2,'k:')
-plot.legend(['$class$ $1$ $(active)$', '$class$ $2$ $(inactive)$', '$decision$ $boundary$'], loc=2, fontsize='small', numpoints=1)
-
-plot.xlim([0,1])
-plot.ylim([0,1])
-plot.xlabel('$X_{1}$')
-plot.ylabel('$X_{2}$')
-ax = plot.gca()
-ax.annotate('$w^{T}x + b=1$', xy=(x[9], y_2[9]), xytext=(x[9] - 0.08, y_2[9] + 0.08),
- arrowprops=dict(facecolor='black', shrink=0.05, width=1, headwidth=3.5, headlength=3), fontsize=15)
-ax.annotate('$w^{T}x + b=0$', xy=(x[8], y[8]), xytext=(x[8] - 0.08, y[8] + 0.08),
- arrowprops=dict(facecolor='black', shrink=0.05, width=1, headwidth=3.5, headlength=3), fontsize=15)
-ax.annotate('$w^{T}x + b=-1$', xy=(x[7], y_1[7]), xytext=(x[7] - 0.08, y_1[7] + 0.08),
- arrowprops=dict(facecolor='black', shrink=0.05, width=1, headwidth=3.5, headlength=3), fontsize=15)
-ax.annotate ('', (x[2], y_1[2]), (x[4]-0.02, y_2[4]+0.02), arrowprops={'arrowstyle':'<->'})
-ax.text(0.25, 0.6, '$2/|W|$',
- verticalalignment='bottom', horizontalalignment='left',
- color='black', fontsize=10)
-plot.hold(False)
-plot.show()
-
-# Figure 7.7
-
-
-x = np.arange(0, 1.05, 0.05)
-
-# Radius of one cicle is 0.1, the other circle 0.3
-
-x_1 = []
-x_2 = []
-y_1 = []
-y_2 = []
-for i in range(0,1000):
- angle = random.uniform(0,1)*(math.pi*2)
- x_1.append(math.cos(angle)/3 + 0.5 + random.gauss(0, 0.025))
- x_2.append(math.cos(angle)/10 + 0.5 + random.gauss(0, 0.025))
- y_1.append(math.sin(angle)/3 + 0.5 + random.gauss(0, 0.025))
- y_2.append(math.sin(angle)/10 + 0.5 + random.gauss(0, 0.025))
-
-fig = plot.figure(figsize=plot.figaspect(2.))
-ax = fig.add_subplot(1, 2, 1)
-
-plot.hold(True)
-ax.plot(x_1,y_1,'ro')
-ax.plot(x_2,y_2,'bo')
-ax.legend(['$class$ $1$ $(other$ $activity)$', '$class$ $2$ $(walking)$'], loc=2, fontsize='small', numpoints=1)
-ax.set_xlim([0,1])
-ax.set_ylim([0,1])
-ax.set_xlabel('$X_{1}$')
-ax.set_ylabel('$X_{2}$')
-
-df1 = pd.DataFrame(columns=list('XY'))
-df1['X'] = x_1
-df1['Y'] = y_1
-df2 = pd.DataFrame(columns=list('XY'))
-df2['X'] = x_2
-df2['Y'] = y_2
-sigma = 1
-z_1 = np.power(math.e, -(sklearn.metrics.pairwise.euclidean_distances(X=df1, Y=np.array([0.5, 0.5]))/2 * math.pow(sigma, 2)))
-z_2 = np.power(math.e, -(sklearn.metrics.pairwise.euclidean_distances(X=df2, Y=np.array([0.5, 0.5]))/2 * math.pow(sigma, 2)))
-ax = fig.add_subplot(1, 2, 2, projection='3d')
-
-ax.scatter(x_1, y_1, z_1, color='r', marker='o')
-ax.scatter(x_2, y_2, z_2, color='b', marker='o')
-ax.set_xlabel('$X_{1}$')
-ax.set_ylabel('$X_{2}$')
-ax.set_zlabel('$e^{||x-x''||^{2}/2\cdot\sigma^{2}}$')
-ax.legend(['$class$ $1$ $(other$ $activity)$', '$class$ $2$ $(walking)$'], loc=2, fontsize='small', numpoints=1)
-
-plot.hold(False)
-plot.show()
-
-# Figure 7.8
-
-plot.hold(True)
-df1 = pd.DataFrame(np.random.normal(30,5,size=(50, 2)), columns=list('XY'))
-df2 = pd.DataFrame(np.random.normal(70,5,size=(50, 2)), columns=list('XY'))
-df1 = df1 / float(100)
-df2 = df2 / float(100)
-# Set the initial random centers to values such that we have a nice example.
-plot.plot(df1['X'], df1['Y'], 'ro')
-plot.plot(df2['X'], df2['Y'], 'bo')
-plot.plot([0.51],[0.51],'ko')
-k = 3
-df_full = pd.concat([df1, df2], ignore_index=True)
-distances_df_full = sklearn.metrics.pairwise.euclidean_distances(X=df_full, Y=np.array([0.51, 0.51])).flatten()
-ind = np.argsort(distances_df_full)[:k]
-plot.plot(df_full.loc[ind, 'X'], df_full.loc[ind, 'Y'] ,'y*', markersize=12)
-
-
-plot.legend(['$class$ $1$ $(active)$', '$class$ $2$ $(inactive)$', '$new$ $point$', '$nearest$ $neighbors$'], loc=4, fontsize='small', numpoints=1)
-plot.xlim([0,1])
-plot.ylim([0,1])
-plot.xlabel('$X_{1}$')
-plot.ylabel('$X_{2}$')
-plot.hold(False)
-plot.show()
diff --git a/PythonCode/ch8_visualization.py b/PythonCode/ch8_visualization.py
deleted file mode 100644
index 74be40c1..00000000
--- a/PythonCode/ch8_visualization.py
+++ /dev/null
@@ -1,399 +0,0 @@
-##############################################################
-# #
-# Mark Hoogendoorn and Burkhardt Funk (2017) #
-# Machine Learning for the Quantified Self #
-# Springer #
-# Chapter 8 - Exemplary graphs #
-# #
-##############################################################
-
-import pandas as pd
-import numpy as np
-import matplotlib.pyplot as plot
-import matplotlib.dates as md
-import math
-import copy
-from scipy.stats import norm
-from sklearn.decomposition import PCA
-from Chapter4.FrequencyAbstraction import FourierTransformation
-from matplotlib.patches import Rectangle
-import re
-import sklearn
-import random
-from mpl_toolkits.mplot3d import Axes3D
-from pandas.tools.plotting import autocorrelation_plot
-from statsmodels.tsa.stattools import pacf
-from statsmodels.tsa.arima_model import ARIMA
-import pyflux as pf
-from statsmodels.tsa.seasonal import seasonal_decompose
-import statsmodels
-
-
-
-
-
-np.random.seed(0)
-
-
-# Figure 8.1
-
-plot.hold(True)
-time = np.arange(1,15,1)
-
-pred_mood = [0.5]
-pred_al = [0.4]
-x = [1]
-delta_t = 1
-gamma_1 = 5
-gamma_2 = 0.8
-gamma_3 = 0.25
-gamma_4 = 1
-gamma_5 = 1
-
-for t in range(1, len(time)):
- pred_mood.append(pred_mood[-1] + x[-1]*(gamma_1*(1-pred_mood[-1])*max((pred_al[-1] - pred_mood[-1]), 0) + gamma_2*pred_mood[-1]*(min((pred_al[-1] - pred_mood[-1]), 0)))*delta_t)
- pred_al.append(pred_al[-1] + gamma_3 * (pred_al[-1] * min(math.sin(((t-(gamma_4*math.pi))/gamma_5)), 0) + (1-pred_al[-1])* max(math.sin(((t-(gamma_4*math.pi))/gamma_5)), 0)))
- x.append(x[-1]);
-
-values_mood = pred_mood
-activity_level = pred_al
-
-plot.plot(time, pred_mood, 'ro-')
-plot.plot(time, pred_al, 'bo:')
-plot.ylim([0,1])
-plot.xlabel('time')
-plot.ylabel('value')
-plot.legend(['$mood$', '$activity$ $level$'], loc=4, fontsize='small', numpoints=1)
-plot.hold(False)
-plot.show()
-
-# Figure 8.2
-
-plot.hold(True)
-f, axarr = plot.subplots(3, 2)
-f.subplots_adjust(hspace=0.5)
-f.subplots_adjust(wspace=0.5)
-random.seed(0)
-random_time_series = pd.DataFrame(np.random.normal(0.1, 2, 5200), index=range(0, 5200), columns=['value'])
-axarr[0,0].plot(random_time_series.index, random_time_series['value'])
-axarr[0,0].set_xlim([101, 500])
-axarr[0,0].set_ylim([-10, 10])
-axarr[0,0].set_xlabel('time')
-axarr[0,0].set_ylabel('value')
-
-autocorrelation_plot(random_time_series['value'], ax=axarr[0, 1])
-axarr[0, 1].set_xlim([0, 30])
-axarr[0, 1].set_ylim([-1.1, 1.1])
-
-rolling_window_data = pd.rolling_mean(random_time_series['value'], 10)
-print rolling_window_data
-axarr[1,0].plot(random_time_series.index, rolling_window_data)
-axarr[1,0].set_xlim([101, 500])
-axarr[1,0].set_ylim([-10, 10])
-axarr[1,0].set_xlabel('time')
-axarr[1,0].set_ylabel('value')
-
-autocorrelation_plot(rolling_window_data[10:], ax=axarr[1, 1])
-axarr[1, 1].set_xlim([0, 30])
-axarr[1, 1].set_ylim([-1.1, 1.1])
-
-cumsum_data = random_time_series.cumsum(axis=0)
-print rolling_window_data
-axarr[2,0].plot(random_time_series.index, cumsum_data)
-axarr[2,0].set_xlim([0, 5000])
-axarr[2,0].set_xlabel('time')
-axarr[2,0].set_ylabel('value')
-
-autocorrelation_plot(cumsum_data, ax=axarr[2, 1])
-axarr[2, 1].set_xlim([0, 30])
-axarr[2, 1].set_ylim([-1.1, 1.1])
-plot.hold(False)
-plot.show()
-
-# Figure 8.3
-
-plot.hold(True)
-plot.plot(random_time_series.index, cumsum_data, 'b-')
-plot.plot(random_time_series.index, cumsum_data.ewm(alpha=0.2).mean(), 'k:')
-plot.plot(random_time_series.index, cumsum_data.ewm(alpha=0.05).mean(), 'r:')
-plot.xlim([0, 1000])
-plot.ylim([-50, 100])
-plot.xlabel('time')
-plot.ylabel('value')
-plot.legend(['$original$ $series$', '$\\alpha=0.2$', '$\\alpha=0.05$'], fontsize='small')
-plot.hold(False)
-plot.show()
-
-# Figure 8.4
-
-plot.hold(True)
-random_time_series = pd.DataFrame(np.random.normal(0, 1, 5200), index=range(0, 5200), columns=['value'])
-random_time_series['value'] = random_time_series['value'] + 5
-random_time_series.ix[1000:3999, 'value'] = random_time_series.ix[1000:3999, 'value'] + 20
-linear_deduction = np.arange(0, -20, -(float(20)/2000))
-random_time_series.ix[2000:3999, 'value'] = random_time_series.ix[2000:3999, 'value'] + linear_deduction
-
-plot.plot(random_time_series.index, random_time_series['value'], 'b-')
-plot.plot(random_time_series.index, random_time_series['value'].ewm(alpha=0.05).mean(), 'r:')
-plot.plot(random_time_series.index, random_time_series['value'].diff(periods=1), 'k-')
-
-plot.xlim([0, 5200])
-plot.xlabel('time')
-plot.ylabel('value')
-plot.legend(['$original$ $series$', '$trend$ $using $ $\\alpha=0.05$', '$detrended$'], fontsize='small')
-plot.hold(False)
-plot.show()
-
-# Figure 8.5
-
-plot.hold(True)
-dataset_path = './intermediate_datafiles/'
-dataset = pd.read_csv(dataset_path + 'chapter2_result.csv', index_col=0)
-dataset.index = dataset.index.to_datetime()
-
-f, axarr = plot.subplots(1, 2)
-f.subplots_adjust(hspace=0.5)
-f.subplots_adjust(wspace=0.5)
-xfmt = md.DateFormatter('%H:%M')
-
-axarr[0].plot(dataset.index, dataset['acc_phone_x'], 'b-')
-axarr[0].set_xlabel('time')
-axarr[0].set_ylabel('value')
-axarr[0].xaxis.set_major_formatter(xfmt)
-axarr[1].plot(dataset.index, dataset['acc_phone_x'].diff(periods=1), 'k-')
-axarr[1].set_xlabel('time')
-axarr[1].set_ylabel('value')
-axarr[1].xaxis.set_major_formatter(xfmt)
-plot.hold(False)
-plot.show()
-
-# Figure 8.6
-
-plot.hold(True)
-dataset_path = './intermediate_datafiles/'
-dataset = pd.read_csv(dataset_path + 'chapter2_result.csv', index_col=0)
-dataset.index = dataset.index.to_datetime()
-
-xfmt = md.DateFormatter('%H:%M')
-
-plot.plot(dataset.index, dataset['acc_phone_x'], color='0.75')
-plot.xlabel('time')
-plot.ylabel('value')
-plot.gca().xaxis.set_major_formatter(xfmt)
-dataset['filtered_acc_x'] = dataset['acc_phone_x'].rolling(400).mean().shift(-200)
-dataset['filtered_acc_y'] = dataset['acc_phone_y'].rolling(400).mean().shift(-200)
-dataset['filtered_acc_z'] = dataset['acc_phone_z'].rolling(400).mean().shift(-200)
-plot.plot(dataset.index, dataset['filtered_acc_x'], 'b-')
-dataset['radius'] = dataset['filtered_acc_x'].pow(2) + dataset['filtered_acc_y'].pow(2) + dataset['filtered_acc_z'].pow(2)
-dataset['radius'] = dataset['radius'].pow(0.5)
-plot.plot(dataset.index, dataset['radius'], 'r-')
-plot.legend(['$original$ $series$', '$filtered$', '$||a||(filtered)$'], fontsize='small')
-plot.hold(False)
-plot.show()
-
-# Figure 8.7
-
-f, axarr = plot.subplots(1, 2)
-plot.hold(True)
-dataset_path = '../datasets/crowdsignals.io/csv-participant-one/'
-dataset = pd.read_csv(dataset_path + 'accelerometer_phone.csv', index_col=0)
-dataset.index = pd.to_datetime(dataset['timestamps']).values
-del dataset['timestamps']
-dataset = dataset.ix[400000:404000, 'x']
-dataset = dataset.resample('10L').mean()
-
-
-temp_ts = dataset
-xfmt = md.DateFormatter('%H:%M')
-
-autocorrelation_plot(temp_ts, ax=axarr[0])
-axarr[0].set_xlim([0,1000])
-axarr[0].set_ylim([-1,1])
-
-pacf_x, confint = pacf(temp_ts, nlags=100, alpha=.05)
-df = pd.DataFrame(confint, columns=['lower', 'upper'])
-df['lower'] = df['lower'] - np.array(pacf_x)
-df['upper'] = df['upper'] - np.array(pacf_x)
-print pacf_x.shape
-axarr[1].plot(range(0, 101), pacf_x, 'b-')
-axarr[1].plot(range(1, 101), df.ix[1:,'lower'], color='0.5')
-axarr[1].plot(range(0, 101), [0]*101, color='0')
-axarr[1].plot(range(1, 101), df.ix[1:,'upper'], color='0.5')
-axarr[1].grid()
-axarr[1].set_ylim([-1,1])
-axarr[1].set_xlabel('Lag')
-axarr[1].set_ylabel('Partial Autocorrelation')
-plot.hold(False)
-plot.show()
-
-# Figure 8.8
-
-plot.hold(True)
-
-model = ARIMA(temp_ts[0:500], order=(3,1,2))
-results_AR = model.fit(disp=-1)
-
-df = pd.DataFrame(temp_ts[0:500], index=temp_ts.index[0:500], columns=['x'])
-model = pf.ARIMA(df, ar=3, ma=2)
-x = model.fit()
-xfmt = md.DateFormatter('%H:%M:%S')
-plot.gca().xaxis.set_major_formatter(xfmt)
-plot.plot(temp_ts.index[200:500], temp_ts[200:500], 'b')
-pred = model.predict_is(h=300)
-plot.plot(pred.index,pred, 'r:')
-#plot.plot(temp_ts.index[200:400], temp_ts[200:400] - pred['Series'], 'k:')
-plot.legend(['$original$ $series$', '$predicted$', '$difference$'], fontsize='small')
-plot.xlabel('time')
-plot.ylabel('value')
-
-plot.hold(False)
-plot.show()
-
-#Figure 8.9
-
-plot.hold(True)
-xfmt = md.DateFormatter('%H:%M:%S')
-plot.gca().xaxis.set_major_formatter(xfmt)
-df = pd.DataFrame(temp_ts[0:400], index=temp_ts.index[0:400], columns=['x'])
-
-model = pf.ARIMA(df, ar=3, ma=2)
-x = model.fit()
-predictions = model.predict(h=100, intervals=True)
-print df
-print predictions
-plot.plot(temp_ts.index[400:500], predictions['x'], 'r:')
-y1 = predictions['5% Prediction Interval']
-y2 = predictions['95% Prediction Interval']
-plot.fill_between(temp_ts.index[400:500], y1, y2, where=y2 >= y1, facecolor='grey', interpolate=True)
-plot.plot(temp_ts.index[200:500], temp_ts[200:500], 'b')
-
-plot.legend(['$predicted$', '$original$ $series$'], fontsize='small')
-plot.xlabel('time')
-plot.ylabel('value')
-plot.show()
-
-#Figure 8.10
-
-plot.hold(True)
-xfmt = md.DateFormatter('%H:%M:%S')
-plot.gca().xaxis.set_major_formatter(xfmt)
-df = pd.DataFrame(temp_ts[0:500], index=temp_ts.index[0:500], columns=['x'])
-decomposition = seasonal_decompose(np.array(df['x'].values),freq=107)
-
-seasonal = decomposition.seasonal
-residual = df['x'] - decomposition.seasonal
-
-f, axarr = plot.subplots(3, 1)
-f.subplots_adjust(hspace=0.5)
-f.subplots_adjust(wspace=0.5)
-axarr[0].plot(temp_ts.index[0:500], temp_ts[0:500], 'b')
-axarr[0].legend(['$original$ $data$'], fontsize='small')
-axarr[0].xaxis.set_major_formatter(xfmt)
-axarr[0].set_ylabel('value')
-axarr[1].plot(temp_ts.index[0:500], seasonal, 'b:')
-axarr[1].legend(['$seasonality$'], fontsize='small')
-axarr[1].xaxis.set_major_formatter(xfmt)
-axarr[1].set_ylabel('value')
-axarr[2].plot(temp_ts.index[0:500], residual, 'k:')
-axarr[2].legend(['$residual$'], fontsize='small')
-axarr[2].xaxis.set_major_formatter(xfmt)
-axarr[2].set_ylabel('value')
-axarr[2].set_xlabel('time')
-
-plot.hold(False)
-plot.show()
-
-#Figure 8.11
-
-plot.hold(True)
-df = pd.DataFrame(residual[0:480], index=temp_ts.index[0:480], columns=['x'])
-model = pf.ARIMA(df, ar=3, ma=2)
-x = model.fit()
-predictions = model.predict(h=100, intervals=True)
-plot.plot(temp_ts.index[400:500], predictions['x'] + seasonal[400:500], 'r:')
-y1 = predictions['5% Prediction Interval'] + seasonal[400:500]
-y2 = predictions['95% Prediction Interval'] + seasonal[400:500]
-plot.fill_between(temp_ts.index[400:500], y1, y2, where=y2 >= y1, facecolor='grey', interpolate=True)
-plot.plot(temp_ts.index[200:500], temp_ts[200:500], 'b')
-
-plot.legend(['$predicted$', '$original$ $series$'], fontsize='small')
-plot.xlabel('time')
-plot.ylabel('value')
-plot.gca().xaxis.set_major_formatter(xfmt)
-plot.hold(False)
-plot.show()
-
-
-# Figure 8.16
-
-
-plot.hold(True)
-f, axarr = plot.subplots(1, 2)
-
-pred_mood = [0.5]
-pred_al = [0.4]
-x = [1]
-delta_t = 1
-gamma_1 = gamma_2 = gamma_3 = gamma_4 = gamma_5 = 1
-
-for t in range(1, len(time)):
- pred_mood.append(pred_mood[-1] + x[-1]*(gamma_1*(1-pred_mood[-1])*max((pred_al[-1] - pred_mood[-1]), 0) + gamma_2*pred_mood[-1]*(min((pred_al[-1] - pred_mood[-1]), 0)))*delta_t)
- pred_al.append(pred_al[-1] + gamma_3 * (pred_al[-1] * min(math.sin(((t-(gamma_4*math.pi))/gamma_5)), 0) + (1-pred_al[-1])* max(math.sin(((t-(gamma_4*math.pi))/gamma_5)), 0)))
- x.append(x[-1]);
-
-axarr[0].plot(time, values_mood, 'ro')
-axarr[0].plot(time, activity_level, 'bo')
-axarr[0].plot(time, pred_mood, 'r-')
-axarr[0].plot(time, pred_al, 'b:')
-
-axarr[0].set_ylim([0,1])
-axarr[0].set_xlabel('time')
-axarr[0].set_ylabel('value')
-axarr[0].legend(['$mood$', '$activity$ $level$', '$predicted$ $mood$ $with$ $\gamma_{1}=\gamma_{2}=\gamma_{3}=\gamma_{4}=\gamma_{5}=1$',
- '$predicted$ $mood$ $with$ $\gamma_{1}=\gamma_{2}=\gamma_{3}=\gamma_{4}=\gamma_{5}=1$'], loc=4, fontsize='small', numpoints=1)
-
-pred_mood = [0.5]
-pred_al = [0.4]
-x = [1]
-delta_t = 1
-gamma_1 = 5
-gamma_2 = 0.75
-gamma_3 = 0.3
-gamma_4 = 1
-gamma_5 = 1
-
-for t in range(1, len(time)):
- pred_mood.append(pred_mood[-1] + x[-1]*(gamma_1*(1-pred_mood[-1])*max((pred_al[-1] - pred_mood[-1]), 0) + gamma_2*pred_mood[-1]*(min((pred_al[-1] - pred_mood[-1]), 0)))*delta_t)
- pred_al.append(pred_al[-1] + gamma_3 * (pred_al[-1] * min(math.sin(((t-(gamma_4*math.pi))/gamma_5)), 0) + (1-pred_al[-1])* max(math.sin(((t-(gamma_4*math.pi))/gamma_5)), 0)))
- x.append(x[-1]);
-
-axarr[1].plot(time, values_mood, 'ro')
-axarr[1].plot(time, activity_level, 'bo')
-axarr[1].plot(time, pred_mood, 'r-')
-axarr[1].plot(time, pred_al, 'b:')
-
-axarr[1].set_ylim([0,1])
-axarr[1].set_xlabel('time')
-axarr[1].set_ylabel('value')
-axarr[1].legend(['$mood$', '$activity$ $level$', '$predicted$ $mood$ $with$ $\gamma_{1}=5, \gamma_{2}=0.75, \gamma_{3}=0.3, \gamma_{4}=\gamma_{5}=1$',
- '$predicted$ $mood$ $with$ $\gamma_{1}=5, \gamma_{2}=0.75, \gamma_{3}=0.3, \gamma_{4}=\gamma_{5}=1$'], loc=4, fontsize='small', numpoints=1)
-
-
-plot.hold(False)
-plot.show()
-
-# Figure 8.19
-
-plot.hold(True)
-x = np.arange(0.05, 1.01, 0.01)
-y = 0.05/x
-plot.plot(x, y, 'r-')
-plot.plot([x[1], x[6], x[20], x[40], x[80]], [y[1], y[6], y[20], y[40], y[80]], 'ro')
-plot.plot([0.4, 0.6], [0.6, 0.4], 'bo')
-plot.legend(['$pareto$ $front$', '$non-dominated$ $instance$','$dominated$ $instance$'], loc=1, fontsize='small', numpoints=1)
-plot.xlim([0,1])
-plot.ylim([0,1])
-plot.xlabel('$E_{X_{1}}$')
-plot.ylabel('$E{X_{2}}$')
-plot.hold(False)
-plot.show()
diff --git a/PythonCode/crowdsignals_ch2.py b/PythonCode/crowdsignals_ch2.py
deleted file mode 100644
index 8f8f0d10..00000000
--- a/PythonCode/crowdsignals_ch2.py
+++ /dev/null
@@ -1,96 +0,0 @@
-##############################################################
-# #
-# Mark Hoogendoorn and Burkhardt Funk (2017) #
-# Machine Learning for the Quantified Self #
-# Springer #
-# Chapter 2 #
-# #
-##############################################################
-
-
-dataset_path = '../datasets/crowdsignals.io/csv-participant-one/'
-result_dataset_path = './intermediate_datafiles/'
-
-# Import the relevant classes.
-
-from Chapter2.CreateDataset import CreateDataset
-from util.VisualizeDataset import VisualizeDataset
-from util import util
-import copy
-import os
-
-
-if not os.path.exists(result_dataset_path):
- print('Creating result directory: ' + result_dataset_path)
- os.makedirs(result_dataset_path)
-
-# Chapter 2: Initial exploration of the dataset.
-
-# Set a granularity (i.e. how big are our discrete time steps). We start very
-# coarse grained, namely one measurement per minute, and secondly use four measurements
-# per second
-
-granularities = [60000, 250]
-datasets = []
-
-for milliseconds_per_instance in granularities:
-
- # Create an initial dataset object with the base directory for our data and a granularity
- DataSet = CreateDataset(dataset_path, milliseconds_per_instance)
-
- # Add the selected measurements to it.
-
- # We add the accelerometer data (continuous numerical measurements) of the phone and the smartwatch
- # and aggregate the values per timestep by averaging the values/
- DataSet.add_numerical_dataset('accelerometer_phone.csv', 'timestamps', ['x','y','z'], 'avg', 'acc_phone_')
- DataSet.add_numerical_dataset('accelerometer_smartwatch.csv', 'timestamps', ['x','y','z'], 'avg', 'acc_watch_')
-
- # We add the gyroscope data (continuous numerical measurements) of the phone and the smartwatch
- # and aggregate the values per timestep by averaging the values/
- DataSet.add_numerical_dataset('gyroscope_phone.csv', 'timestamps', ['x','y','z'], 'avg', 'gyr_phone_')
- DataSet.add_numerical_dataset('gyroscope_smartwatch.csv', 'timestamps', ['x','y','z'], 'avg', 'gyr_watch_')
-
- # We add the heart rate (continuous numerical measurements) and aggregate by averaging again
- DataSet.add_numerical_dataset('heart_rate_smartwatch.csv', 'timestamps', ['rate'], 'avg', 'hr_watch_')
-
- # We add the labels provided by the users. These are categorical events that might overlap. We add them
- # as binary attributes (i.e. add a one to the attribute representing the specific value for the label if it
- # occurs within an interval).
- DataSet.add_event_dataset('labels.csv', 'label_start', 'label_end', 'label', 'binary')
-
- # We add the amount of light sensed by the phone (continuous numerical measurements) and aggregate by averaging again
- DataSet.add_numerical_dataset('light_phone.csv', 'timestamps', ['lux'], 'avg', 'light_phone_')
-
- # We add the magnetometer data (continuous numerical measurements) of the phone and the smartwatch
- # and aggregate the values per timestep by averaging the values
- DataSet.add_numerical_dataset('magnetometer_phone.csv', 'timestamps', ['x','y','z'], 'avg', 'mag_phone_')
- DataSet.add_numerical_dataset('magnetometer_smartwatch.csv', 'timestamps', ['x','y','z'], 'avg', 'mag_watch_')
-
- # We add the pressure sensed by the phone (continuous numerical measurements) and aggregate by averaging again
- DataSet.add_numerical_dataset('pressure_phone.csv', 'timestamps', ['pressure'], 'avg', 'press_phone_')
-
- # Get the resulting pandas data table
-
- dataset = DataSet.data_table
-
- # Plot the data
-
- DataViz = VisualizeDataset()
-
- # Boxplot
- DataViz.plot_dataset_boxplot(dataset, ['acc_phone_x','acc_phone_y','acc_phone_z','acc_watch_x','acc_watch_y','acc_watch_z'])
-
- # Plot all data
- DataViz.plot_dataset(dataset, ['acc_', 'gyr_', 'hr_watch_rate', 'light_phone_lux', 'mag_', 'press_phone_', 'label'], ['like', 'like', 'like', 'like', 'like', 'like', 'like','like'], ['line', 'line', 'line', 'line', 'line', 'line', 'points', 'points'])
-
- # And print a summary of the dataset
-
- util.print_statistics(dataset)
- datasets.append(copy.deepcopy(dataset))
-
-# And print the table that has been included in the book
-
-util.print_latex_table_statistics_two_datasets(datasets[0], datasets[1])
-
-# Finally, store the last dataset we have generated (250 ms).
-dataset.to_csv(result_dataset_path + 'chapter2_result.csv')
diff --git a/PythonCode/crowdsignals_ch3_outliers.py b/PythonCode/crowdsignals_ch3_outliers.py
deleted file mode 100644
index e3f97fed..00000000
--- a/PythonCode/crowdsignals_ch3_outliers.py
+++ /dev/null
@@ -1,81 +0,0 @@
-##############################################################
-# #
-# Mark Hoogendoorn and Burkhardt Funk (2017) #
-# Machine Learning for the Quantified Self #
-# Springer #
-# Chapter 3 #
-# #
-##############################################################
-
-from util.VisualizeDataset import VisualizeDataset
-from Chapter3.OutlierDetection import DistributionBasedOutlierDetection
-from Chapter3.OutlierDetection import DistanceBasedOutlierDetection
-import copy
-import pandas as pd
-import numpy as np
-
-# Let is create our visualization class again.
-DataViz = VisualizeDataset()
-
-# Read the result from the previous chapter, and make sture the index is of the type datetime.
-dataset_path = './intermediate_datafiles/'
-try:
- dataset = pd.read_csv(dataset_path + 'chapter2_result.csv', index_col=0)
-except IOError as e:
- print('File not found, try to run previous crowdsignals scripts first!')
- raise e
-
-dataset.index = dataset.index.to_datetime()
-
-# Compute the number of milliseconds covered by an instance based on the first two rows
-milliseconds_per_instance = (dataset.index[1] - dataset.index[0]).microseconds/1000
-
-# Step 1: Let us see whether we have some outliers we would prefer to remove.
-
-# Determine the columns we want to experiment on.
-outlier_columns = ['acc_phone_x', 'light_phone_lux']
-
-# Create the outlier classes.
-OutlierDistr = DistributionBasedOutlierDetection()
-OutlierDist = DistanceBasedOutlierDetection()
-
-#And investigate the approaches for all relevant attributes.
-for col in outlier_columns:
- # And try out all different approaches. Note that we have done some optimization
- # of the parameter values for each of the approaches by visual inspection.
- dataset = OutlierDistr.chauvenet(dataset, col)
- DataViz.plot_binary_outliers(dataset, col, col + '_outlier')
- dataset = OutlierDistr.mixture_model(dataset, col)
- DataViz.plot_dataset(dataset, [col, col + '_mixture'], ['exact','exact'], ['line', 'points'])
- # This requires:
- # n_data_points * n_data_points * point_size =
- # 31839 * 31839 * 64 bits = ~8GB available memory
- try:
- dataset = OutlierDist.simple_distance_based(dataset, [col], 'euclidean', 0.10, 0.99)
- DataViz.plot_binary_outliers(dataset, col, 'simple_dist_outlier')
- except MemoryError as e:
- print('Not enough memory available for simple distance-based outlier detection...')
- print('Skipping.')
-
- try:
- dataset = OutlierDist.local_outlier_factor(dataset, [col], 'euclidean', 5)
- DataViz.plot_dataset(dataset, [col, 'lof'], ['exact','exact'], ['line', 'points'])
- except MemoryError as e:
- print('Not enough memory available for lof...')
- print('Skipping.')
-
- # Remove all the stuff from the dataset again.
- cols_to_remove = [col + '_outlier', col + '_mixture', 'simple_dist_outlier', 'lof']
- for to_remove in cols_to_remove:
- if to_remove in dataset:
- del dataset[to_remove]
-
-# We take Chauvent's criterion and apply it to all but the label data...
-
-for col in [c for c in dataset.columns if not 'label' in c]:
- print 'Measurement is now: ' , col
- dataset = OutlierDistr.chauvenet(dataset, col)
- dataset.loc[dataset[col + '_outlier'] == True, col] = np.nan
- del dataset[col + '_outlier']
-
-dataset.to_csv(dataset_path + 'chapter3_result_outliers.csv')
diff --git a/PythonCode/crowdsignals_ch3_rest.py b/PythonCode/crowdsignals_ch3_rest.py
deleted file mode 100644
index 4364d80f..00000000
--- a/PythonCode/crowdsignals_ch3_rest.py
+++ /dev/null
@@ -1,108 +0,0 @@
-##############################################################
-# #
-# Mark Hoogendoorn and Burkhardt Funk (2017) #
-# Machine Learning for the Quantified Self #
-# Springer #
-# Chapter 3 #
-# #
-##############################################################
-
-from util.VisualizeDataset import VisualizeDataset
-from Chapter3.DataTransformation import LowPassFilter
-from Chapter3.DataTransformation import PrincipalComponentAnalysis
-from Chapter3.ImputationMissingValues import ImputationMissingValues
-from Chapter3.KalmanFilters import KalmanFilters
-import copy
-import pandas as pd
-import numpy as np
-import matplotlib.pyplot as plot
-
-# Let is create our visualization class again.
-DataViz = VisualizeDataset()
-
-# Read the result from the previous chapter, and make sure the index is of the type datetime.
-dataset_path = './intermediate_datafiles/'
-dataset = pd.read_csv(dataset_path + 'chapter3_result_outliers.csv', index_col=0)
-dataset.index = dataset.index.to_datetime()
-
-# Computer the number of milliseconds covered by an instane based on the first two rows
-milliseconds_per_instance = (dataset.index[1] - dataset.index[0]).microseconds/1000
-
-# Step 2: Let us impute the missing values.
-
-MisVal = ImputationMissingValues()
-imputed_mean_dataset = MisVal.impute_mean(copy.deepcopy(dataset), 'hr_watch_rate')
-imputed_median_dataset = MisVal.impute_median(copy.deepcopy(dataset), 'hr_watch_rate')
-imputed_interpolation_dataset = MisVal.impute_interpolate(copy.deepcopy(dataset), 'hr_watch_rate')
-DataViz.plot_imputed_values(dataset, ['original', 'mean', 'interpolation'], 'hr_watch_rate', imputed_mean_dataset['hr_watch_rate'], imputed_interpolation_dataset['hr_watch_rate'])
-
-# And we impute for all columns except for the label in the selected way (interpolation)
-
-for col in [c for c in dataset.columns if not 'label' in c]:
- dataset = MisVal.impute_interpolate(dataset, col)
-
-# Let us try the Kalman filter on the light_phone_lux attribute and study the result.
-
-original_dataset = pd.read_csv(dataset_path + 'chapter2_result.csv', index_col=0)
-original_dataset.index = original_dataset.index.to_datetime()
-KalFilter = KalmanFilters()
-kalman_dataset = KalFilter.apply_kalman_filter(original_dataset, 'acc_phone_x')
-DataViz.plot_imputed_values(kalman_dataset, ['original', 'kalman'], 'acc_phone_x', kalman_dataset['acc_phone_x_kalman'])
-DataViz.plot_dataset(kalman_dataset, ['acc_phone_x', 'acc_phone_x_kalman'], ['exact','exact'], ['line', 'line'])
-
-# We ignore the Kalman filter output for now...
-
-# Let us apply a lowpass filter and reduce the importance of the data above 1.5 Hz
-
-LowPass = LowPassFilter()
-
-# Determine the sampling frequency.
-fs = float(1000)/milliseconds_per_instance
-cutoff = 1.5
-
-# Let us study acc_phone_x:
-new_dataset = LowPass.low_pass_filter(copy.deepcopy(dataset), 'acc_phone_x', fs, cutoff, order=10)
-DataViz.plot_dataset(new_dataset.ix[int(0.4*len(new_dataset.index)):int(0.43*len(new_dataset.index)), :], ['acc_phone_x', 'acc_phone_x_lowpass'], ['exact','exact'], ['line', 'line'])
-
-# And not let us include all measurements that have a form of periodicity (and filter them):
-periodic_measurements = ['acc_phone_x', 'acc_phone_y', 'acc_phone_z', 'acc_watch_x', 'acc_watch_y', 'acc_watch_z', 'gyr_phone_x', 'gyr_phone_y',
- 'gyr_phone_z', 'gyr_watch_x', 'gyr_watch_y', 'gyr_watch_z', 'mag_phone_x', 'mag_phone_y', 'mag_phone_z', 'mag_watch_x',
- 'mag_watch_y', 'mag_watch_z']
-
-for col in periodic_measurements:
- dataset = LowPass.low_pass_filter(dataset, col, fs, cutoff, order=10)
- dataset[col] = dataset[col + '_lowpass']
- del dataset[col + '_lowpass']
-
-
-# Determine the PC's for all but our target columns (the labels and the heart rate)
-# We simplify by ignoring both, we could also ignore one first, and apply a PC to the remainder.
-
-PCA = PrincipalComponentAnalysis()
-selected_predictor_cols = [c for c in dataset.columns if (not ('label' in c)) and (not (c == 'hr_watch_rate'))]
-pc_values = PCA.determine_pc_explained_variance(dataset, selected_predictor_cols)
-
-# Plot the variance explained.
-
-plot.plot(range(1, len(selected_predictor_cols)+1), pc_values, 'b-')
-plot.xlabel('principal component number')
-plot.ylabel('explained variance')
-plot.show(block=False)
-
-# We select 7 as the best number of PC's as this explains most of the variance
-
-n_pcs = 7
-
-dataset = PCA.apply_pca(copy.deepcopy(dataset), selected_predictor_cols, n_pcs)
-
-#And we visualize the result of the PC's
-
-DataViz.plot_dataset(dataset, ['pca_', 'label'], ['like', 'like'], ['line', 'points'])
-
-# And the overall final dataset:
-
-DataViz.plot_dataset(dataset, ['acc_', 'gyr_', 'hr_watch_rate', 'light_phone_lux', 'mag_', 'press_phone_', 'pca_', 'label'], ['like', 'like', 'like', 'like', 'like', 'like', 'like','like', 'like'], ['line', 'line', 'line', 'line', 'line', 'line', 'line', 'points', 'points'])
-
-# Store the outcome.
-
-dataset.to_csv(dataset_path + 'chapter3_result_final.csv')
\ No newline at end of file
diff --git a/PythonCode/crowdsignals_ch4.py b/PythonCode/crowdsignals_ch4.py
deleted file mode 100644
index 70376165..00000000
--- a/PythonCode/crowdsignals_ch4.py
+++ /dev/null
@@ -1,85 +0,0 @@
-##############################################################
-# #
-# Mark Hoogendoorn and Burkhardt Funk (2017) #
-# Machine Learning for the Quantified Self #
-# Springer #
-# Chapter 4 #
-# #
-##############################################################
-
-from util.VisualizeDataset import VisualizeDataset
-from Chapter4.TemporalAbstraction import NumericalAbstraction
-from Chapter4.TemporalAbstraction import CategoricalAbstraction
-from Chapter4.FrequencyAbstraction import FourierTransformation
-from Chapter4.TextAbstraction import TextAbstraction
-import copy
-import pandas as pd
-
-# Let us create our visualization class again.
-DataViz = VisualizeDataset()
-
-# Read the result from the previous chapter, and make sure the index is of the type datetime.
-dataset_path = './intermediate_datafiles/'
-try:
- dataset = pd.read_csv(dataset_path + 'chapter3_result_final.csv', index_col=0)
-except IOError as e:
- print('File not found, try to run previous crowdsignals scripts first!')
- raise e
-
-dataset.index = dataset.index.to_datetime()
-
-# Compute the number of milliseconds covered by an instane based on the first two rows
-milliseconds_per_instance = (dataset.index[1] - dataset.index[0]).microseconds/1000
-
-
-# Chapter 4: Identifying aggregate attributes.
-
-# First we focus on the time domain.
-
-# Set the window sizes to the number of instances representing 5 seconds, 30 seconds and 5 minutes
-window_sizes = [int(float(5000)/milliseconds_per_instance), int(float(0.5*60000)/milliseconds_per_instance), int(float(5*60000)/milliseconds_per_instance)]
-
-NumAbs = NumericalAbstraction()
-dataset_copy = copy.deepcopy(dataset)
-for ws in window_sizes:
- dataset_copy = NumAbs.abstract_numerical(dataset_copy, ['acc_phone_x'], ws, 'mean')
- dataset_copy = NumAbs.abstract_numerical(dataset_copy, ['acc_phone_x'], ws, 'std')
-
-DataViz.plot_dataset(dataset_copy, ['acc_phone_x', 'acc_phone_x_temp_mean', 'acc_phone_x_temp_std', 'label'], ['exact', 'like', 'like', 'like'], ['line', 'line', 'line', 'points'])
-
-ws = int(float(0.5*60000)/milliseconds_per_instance)
-selected_predictor_cols = [c for c in dataset.columns if not 'label' in c]
-dataset = NumAbs.abstract_numerical(dataset, selected_predictor_cols, ws, 'mean')
-dataset = NumAbs.abstract_numerical(dataset, selected_predictor_cols, ws, 'std')
-
-
-CatAbs = CategoricalAbstraction()
-dataset = CatAbs.abstract_categorical(dataset, ['label'], ['like'], 0.03, int(float(5*60000)/milliseconds_per_instance), 2)
-
-# Now we move to the frequency domain, with the same window size.
-
-FreqAbs = FourierTransformation()
-fs = float(1000)/milliseconds_per_instance
-
-periodic_predictor_cols = ['acc_phone_x','acc_phone_y','acc_phone_z','acc_watch_x','acc_watch_y','acc_watch_z','gyr_phone_x','gyr_phone_y',
- 'gyr_phone_z','gyr_watch_x','gyr_watch_y','gyr_watch_z','mag_phone_x','mag_phone_y','mag_phone_z',
- 'mag_watch_x','mag_watch_y','mag_watch_z']
-data_table = FreqAbs.abstract_frequency(copy.deepcopy(dataset), ['acc_phone_x'], int(float(10000)/milliseconds_per_instance), fs)
-
-# Spectral analysis.
-
-DataViz.plot_dataset(data_table, ['acc_phone_x_max_freq', 'acc_phone_x_freq_weighted', 'acc_phone_x_pse', 'label'], ['like', 'like', 'like', 'like'], ['line', 'line', 'line','points'])
-
-dataset = FreqAbs.abstract_frequency(dataset, periodic_predictor_cols, int(float(10000)/milliseconds_per_instance), fs)
-
-# Now we only take a certain percentage of overlap in the windows, otherwise our training examples will be too much alike.
-
-# The percentage of overlap we allow
-window_overlap = 0.9
-skip_points = int((1-window_overlap) * ws)
-dataset = dataset.iloc[::skip_points,:]
-
-
-dataset.to_csv(dataset_path + 'chapter4_result.csv')
-
-DataViz.plot_dataset(dataset, ['acc_phone_x', 'gyr_phone_x', 'hr_watch_rate', 'light_phone_lux', 'mag_phone_x', 'press_phone_', 'pca_1', 'label'], ['like', 'like', 'like', 'like', 'like', 'like', 'like','like'], ['line', 'line', 'line', 'line', 'line', 'line', 'line', 'points'])
diff --git a/PythonCode/crowdsignals_ch5.py b/PythonCode/crowdsignals_ch5.py
deleted file mode 100644
index d3e8320f..00000000
--- a/PythonCode/crowdsignals_ch5.py
+++ /dev/null
@@ -1,127 +0,0 @@
-##############################################################
-# #
-# Mark Hoogendoorn and Burkhardt Funk (2017) #
-# Machine Learning for the Quantified Self #
-# Springer #
-# Chapter 5 #
-# #
-##############################################################
-
-from util.VisualizeDataset import VisualizeDataset
-from Chapter5.DistanceMetrics import InstanceDistanceMetrics
-from Chapter5.DistanceMetrics import PersonDistanceMetricsNoOrdering
-from Chapter5.DistanceMetrics import PersonDistanceMetricsOrdering
-from Chapter5.Clustering import NonHierarchicalClustering
-from Chapter5.Clustering import HierarchicalClustering
-import copy
-import pandas as pd
-import matplotlib.pyplot as plot
-import util.util as util
-
-
-# Of course we repeat some stuff from Chapter 3, namely to load the dataset
-
-DataViz = VisualizeDataset()
-
-# Read the result from the previous chapter, and make sure the index is of the type datetime.
-dataset_path = './intermediate_datafiles/'
-
-try:
- dataset = pd.read_csv(dataset_path + 'chapter4_result.csv', index_col=0)
-except IOError as e:
- print('File not found, try to run previous crowdsignals scripts first!')
- raise e
-dataset.index = dataset.index.to_datetime()
-
-# First let us use non hierarchical clustering.
-
-clusteringNH = NonHierarchicalClustering()
-
-# Let us look at k-means first.
-
-k_values = range(2, 10)
-silhouette_values = []
-#
-## Do some initial runs to determine the right number for k
-#
-print '===== kmeans clustering ====='
-for k in k_values:
- print 'k = ', k
- dataset_cluster = clusteringNH.k_means_over_instances(copy.deepcopy(dataset), ['acc_phone_x', 'acc_phone_y', 'acc_phone_z'], k, 'default', 20, 10)
- silhouette_score = dataset_cluster['silhouette'].mean()
- print 'silhouette = ', silhouette_score
- silhouette_values.append(silhouette_score)
-
-plot.plot(k_values, silhouette_values, 'b-')
-plot.xlabel('k')
-plot.ylabel('silhouette score')
-plot.ylim([0,1])
-plot.show()
-
-# And run the knn with the highest silhouette score
-
-k = 6
-
-dataset_knn = clusteringNH.k_means_over_instances(copy.deepcopy(dataset), ['acc_phone_x', 'acc_phone_y', 'acc_phone_z'], k, 'default', 50, 50)
-DataViz.plot_clusters_3d(dataset_knn, ['acc_phone_x', 'acc_phone_y', 'acc_phone_z'], 'cluster', ['label'])
-DataViz.plot_silhouette(dataset_knn, 'cluster', 'silhouette')
-util.print_latex_statistics_clusters(dataset_knn, 'cluster', ['acc_phone_x', 'acc_phone_y', 'acc_phone_z'], 'label')
-del dataset_knn['silhouette']
-
-
-k_values = range(2, 10)
-silhouette_values = []
-
-# Do some initial runs to determine the right number for k
-
-print '===== k medoids clustering ====='
-for k in k_values:
- print 'k = ', k
- dataset_cluster = clusteringNH.k_medoids_over_instances(copy.deepcopy(dataset), ['acc_phone_x', 'acc_phone_y', 'acc_phone_z'], k, 'default', 20, n_inits=10)
- silhouette_score = dataset_cluster['silhouette'].mean()
- print 'silhouette = ', silhouette_score
- silhouette_values.append(silhouette_score)
-
-plot.plot(k_values, silhouette_values, 'b-')
-plot.ylim([0,1])
-plot.xlabel('k')
-plot.ylabel('silhouette score')
-plot.show()
-
-# And run k medoids with the highest silhouette score
-
-k = 6
-
-dataset_kmed = clusteringNH.k_medoids_over_instances(copy.deepcopy(dataset), ['acc_phone_x', 'acc_phone_y', 'acc_phone_z'], k, 'default', 20, n_inits=50)
-DataViz.plot_clusters_3d(dataset_kmed, ['acc_phone_x', 'acc_phone_y', 'acc_phone_z'], 'cluster', ['label'])
-DataViz.plot_silhouette(dataset_kmed, 'cluster', 'silhouette')
-util.print_latex_statistics_clusters(dataset_kmed, 'cluster', ['acc_phone_x', 'acc_phone_y', 'acc_phone_z'], 'label')
-
-# And the hierarchical clustering is the last one we try
-
-clusteringH = HierarchicalClustering()
-
-k_values = range(2, 10)
-silhouette_values = []
-
-# Do some initial runs to determine the right number for the maximum number of clusters.
-
-print '===== agglomaritive clustering ====='
-for k in k_values:
- print 'k = ', k
- dataset_cluster, l = clusteringH.agglomerative_over_instances(copy.deepcopy(dataset), ['acc_phone_x', 'acc_phone_y', 'acc_phone_z'], k, 'euclidean', use_prev_linkage=True, link_function='ward')
- silhouette_score = dataset_cluster['silhouette'].mean()
- print 'silhouette = ', silhouette_score
- silhouette_values.append(silhouette_score)
- if k == k_values[0]:
- DataViz.plot_dendrogram(dataset_cluster, l)
-
-plot.plot(k_values, silhouette_values, 'b-')
-plot.ylim([0,1])
-plot.xlabel('max number of clusters')
-plot.ylabel('silhouette score')
-plot.show()
-
-# And we select the outcome dataset of the knn clustering....
-
-dataset_knn.to_csv(dataset_path + 'chapter5_result.csv')
diff --git a/PythonCode/crowdsignals_ch7_classification.py b/PythonCode/crowdsignals_ch7_classification.py
deleted file mode 100644
index 8ef85600..00000000
--- a/PythonCode/crowdsignals_ch7_classification.py
+++ /dev/null
@@ -1,241 +0,0 @@
-##############################################################
-# #
-# Mark Hoogendoorn and Burkhardt Funk (2017) #
-# Machine Learning for the Quantified Self #
-# Springer #
-# Chapter 7 #
-# #
-##############################################################
-
-from util.VisualizeDataset import VisualizeDataset
-from Chapter7.PrepareDatasetForLearning import PrepareDatasetForLearning
-from Chapter7.LearningAlgorithms import ClassificationAlgorithms
-from Chapter7.LearningAlgorithms import RegressionAlgorithms
-from Chapter7.Evaluation import ClassificationEvaluation
-from Chapter7.Evaluation import RegressionEvaluation
-from Chapter7.FeatureSelection import FeatureSelectionClassification
-from Chapter7.FeatureSelection import FeatureSelectionRegression
-import copy
-import pandas as pd
-from util import util
-import matplotlib.pyplot as plot
-import numpy as np
-from sklearn.model_selection import train_test_split
-import os
-
-
-# Of course we repeat some stuff from Chapter 3, namely to load the dataset
-
-DataViz = VisualizeDataset()
-
-# Read the result from the previous chapter, and make sure the index is of the type datetime.
-
-dataset_path = './intermediate_datafiles/'
-export_tree_path = 'Example_graphs/Chapter7/'
-
-try:
- dataset = pd.read_csv(dataset_path + 'chapter5_result.csv', index_col=0)
-except IOError as e:
- print('File not found, try to run previous crowdsignals scripts first!')
- raise e
-
-if not os.path.exists(export_tree_path):
- os.makedirs(export_tree_path)
-
-dataset.index = dataset.index.to_datetime()
-
-# Let us consider our first task, namely the prediction of the label. We consider this as a non-temporal task.
-
-# We create a single column with the categorical attribute representing our class. Furthermore, we use 70% of our data
-# for training and the remaining 30% as an independent test set. We select the sets based on stratified sampling. We remove
-# cases where we do not know the label.
-
-prepare = PrepareDatasetForLearning()
-
-train_X, test_X, train_y, test_y = prepare.split_single_dataset_classification(dataset, ['label'], 'like', 0.7, filter=True, temporal=False)
-#train_X, test_X, train_y, test_y = prepare.split_single_dataset_classification(dataset, ['label'], 'like', 0.01, filter=True, temporal=False)
-
-print 'Training set length is: ', len(train_X.index)
-print 'Test set length is: ', len(test_X.index)
-
-# Select subsets of the features that we will consider:
-
-basic_features = ['acc_phone_x','acc_phone_y','acc_phone_z','acc_watch_x','acc_watch_y','acc_watch_z','gyr_phone_x','gyr_phone_y','gyr_phone_z','gyr_watch_x','gyr_watch_y','gyr_watch_z',
- 'hr_watch_rate', 'light_phone_lux','mag_phone_x','mag_phone_y','mag_phone_z','mag_watch_x','mag_watch_y','mag_watch_z','press_phone_pressure']
-pca_features = ['pca_1','pca_2','pca_3','pca_4','pca_5','pca_6','pca_7']
-time_features = [name for name in dataset.columns if '_temp_' in name]
-freq_features = [name for name in dataset.columns if (('_freq' in name) or ('_pse' in name))]
-print '#basic features: ', len(basic_features)
-print '#PCA features: ', len(pca_features)
-print '#time features: ', len(time_features)
-print '#frequency features: ', len(freq_features)
-cluster_features = ['cluster']
-print '#cluster features: ', len(cluster_features)
-features_after_chapter_3 = list(set().union(basic_features, pca_features))
-features_after_chapter_4 = list(set().union(basic_features, pca_features, time_features, freq_features))
-features_after_chapter_5 = list(set().union(basic_features, pca_features, time_features, freq_features, cluster_features))
-
-
-# First, let us consider the performance over a selection of features:
-
-fs = FeatureSelectionClassification()
-
-features, ordered_features, ordered_scores = fs.forward_selection(50, train_X[features_after_chapter_5], train_y)
-print ordered_scores
-print ordered_features
-
-plot.plot(range(1, 51), ordered_scores)
-plot.xlabel('number of features')
-plot.ylabel('accuracy')
-plot.show()
-
-# Based on the plot we select the top 10 features.
-
-selected_features = ['acc_phone_y_freq_0.0_Hz_ws_40', 'press_phone_pressure_temp_mean_ws_120', 'gyr_phone_x_temp_std_ws_120',
- 'mag_watch_y_pse', 'mag_phone_z_max_freq', 'gyr_watch_y_freq_weighted', 'gyr_phone_y_freq_1.0_Hz_ws_40',
- 'acc_phone_x_freq_1.9_Hz_ws_40', 'mag_watch_z_freq_0.9_Hz_ws_40', 'acc_watch_y_freq_0.5_Hz_ws_40']
-
-# Let us first study the impact of regularization and model complexity: does regularization prevent overfitting?
-
-learner = ClassificationAlgorithms()
-eval = ClassificationEvaluation()
-
-reg_parameters = [0.0001, 0.001, 0.01, 0.1, 1, 10]
-performance_training = []
-performance_test = []
-
-# We repeat the experiment a number of times to get a bit more robust data as the initialization of the NN is random.
-
-repeats = 20
-
-for reg_param in reg_parameters:
- performance_tr = 0
- performance_te = 0
- for i in range(0, repeats):
- class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.feedforward_neural_network(train_X, train_y,
- test_X, hidden_layer_sizes=(250, ), alpha=reg_param, max_iter=500,
- gridsearch=False)
- performance_tr += eval.accuracy(train_y, class_train_y)
- performance_te += eval.accuracy(test_y, class_test_y)
- performance_training.append(performance_tr/repeats)
- performance_test.append(performance_te/repeats)
-
-plot.hold(True)
-plot.semilogx(reg_parameters, performance_training, 'r-')
-plot.semilogx(reg_parameters, performance_test, 'b:')
-print performance_training
-print performance_test
-plot.xlabel('regularization parameter value')
-plot.ylabel('accuracy')
-plot.ylim([0.95, 1.01])
-plot.legend(['training', 'test'], loc=4)
-plot.hold(False)
-
-plot.show()
-
-# Second, let us consider the influence of certain parameter settings (very related to the regulariztion) and study the impact on performance.
-
-leaf_settings = [1,2,5,10]
-performance_training = []
-performance_test = []
-
-for no_points_leaf in leaf_settings:
- class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.decision_tree(train_X[selected_features], train_y, test_X[selected_features], min_samples_leaf=no_points_leaf,
- gridsearch=False, print_model_details=False)
- performance_training.append(eval.accuracy(train_y, class_train_y))
- performance_test.append(eval.accuracy(test_y, class_test_y))
-
-plot.hold(True)
-plot.plot(leaf_settings, performance_training, 'r-')
-plot.plot(leaf_settings, performance_test, 'b:')
-plot.xlabel('minimum number of points per leaf')
-plot.ylabel('accuracy')
-plot.legend(['training', 'test'], loc=1)
-plot.hold(False)
-
-plot.show()
-
-
-# So yes, it is important :) Therefore we perform grid searches over the most important parameters, and do so by means
-# of cross validation upon the training set.
-
-
-possible_feature_sets = [basic_features, features_after_chapter_3, features_after_chapter_4, features_after_chapter_5, selected_features]
-feature_names = ['initial set', 'Chapter 3', 'Chapter 4', 'Chapter 5', 'Selected features']
-repeats = 5
-
-scores_over_all_algs = []
-
-for i in range(0, len(possible_feature_sets)):
- selected_train_X = train_X[possible_feature_sets[i]]
- selected_test_X = test_X[possible_feature_sets[i]]
-
- # First we run our non deterministic classifiers a number of times to average their score.
-
- performance_tr_nn = 0
- performance_tr_rf = 0
- performance_tr_svm = 0
- performance_te_nn = 0
- performance_te_rf = 0
- performance_te_svm = 0
-
- for repeat in range(0, repeats):
- class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.feedforward_neural_network(selected_train_X, train_y, selected_test_X, gridsearch=True)
- performance_tr_nn += eval.accuracy(train_y, class_train_y)
- performance_te_nn += eval.accuracy(test_y, class_test_y)
-
- class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.random_forest(selected_train_X, train_y, selected_test_X, gridsearch=True)
- performance_tr_rf += eval.accuracy(train_y, class_train_y)
- performance_te_rf += eval.accuracy(test_y, class_test_y)
-
- class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.support_vector_machine_with_kernel(selected_train_X, train_y, selected_test_X, gridsearch=True)
- performance_tr_svm += eval.accuracy(train_y, class_train_y)
- performance_te_svm += eval.accuracy(test_y, class_test_y)
-
-
- overall_performance_tr_nn = performance_tr_nn/repeats
- overall_performance_te_nn = performance_te_nn/repeats
- overall_performance_tr_rf = performance_tr_rf/repeats
- overall_performance_te_rf = performance_te_rf/repeats
- overall_performance_tr_svm = performance_tr_svm/repeats
- overall_performance_te_svm = performance_te_svm/repeats
-
- # And we run our deterministic classifiers:
-
-
- class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.k_nearest_neighbor(selected_train_X, train_y, selected_test_X, gridsearch=True)
- performance_tr_knn = eval.accuracy(train_y, class_train_y)
- performance_te_knn = eval.accuracy(test_y, class_test_y)
-
- class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.decision_tree(selected_train_X, train_y, selected_test_X, gridsearch=True)
- performance_tr_dt = eval.accuracy(train_y, class_train_y)
- performance_te_dt = eval.accuracy(test_y, class_test_y)
-
- class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.naive_bayes(selected_train_X, train_y, selected_test_X)
- performance_tr_nb = eval.accuracy(train_y, class_train_y)
- performance_te_nb = eval.accuracy(test_y, class_test_y)
-
- scores_with_sd = util.print_table_row_performances(feature_names[i], len(selected_train_X.index), len(selected_test_X.index), [
- (overall_performance_tr_nn, overall_performance_te_nn),
- (overall_performance_tr_rf, overall_performance_te_rf),
- (overall_performance_tr_svm, overall_performance_te_svm),
- (performance_tr_knn, performance_te_knn),
- (performance_tr_dt, performance_te_dt),
- (performance_tr_nb, performance_te_nb)])
- scores_over_all_algs.append(scores_with_sd)
-
-DataViz.plot_performances_classification(['NN', 'RF', 'SVM', 'KNN', 'DT', 'NB'], feature_names, scores_over_all_algs)
-
-# And we study two promising ones in more detail. First let us consider the decision tree which works best with the selected
-# features.
-#
-class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.decision_tree(train_X[selected_features], train_y, test_X[selected_features],
- gridsearch=True,
- print_model_details=True, export_tree_path=export_tree_path)
-
-class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = learner.random_forest(train_X[selected_features], train_y, test_X[selected_features],
- gridsearch=True, print_model_details=True)
-
-test_cm = eval.confusion_matrix(test_y, class_test_y, class_train_prob_y.columns)
-
-DataViz.plot_confusion_matrix(test_cm, class_train_prob_y.columns, normalize=False)
diff --git a/PythonCode/crowdsignals_ch7_regression.py b/PythonCode/crowdsignals_ch7_regression.py
deleted file mode 100644
index 44fbedfb..00000000
--- a/PythonCode/crowdsignals_ch7_regression.py
+++ /dev/null
@@ -1,188 +0,0 @@
-##############################################################
-# #
-# Mark Hoogendoorn and Burkhardt Funk (2017) #
-# Machine Learning for the Quantified Self #
-# Springer #
-# Chapter 7 #
-# #
-##############################################################
-
-from util.VisualizeDataset import VisualizeDataset
-from Chapter7.PrepareDatasetForLearning import PrepareDatasetForLearning
-from Chapter7.LearningAlgorithms import ClassificationAlgorithms
-from Chapter7.LearningAlgorithms import RegressionAlgorithms
-from Chapter7.Evaluation import ClassificationEvaluation
-from Chapter7.Evaluation import RegressionEvaluation
-from Chapter7.FeatureSelection import FeatureSelectionClassification
-from Chapter7.FeatureSelection import FeatureSelectionRegression
-import copy
-import pandas as pd
-from util import util
-import matplotlib.pyplot as plot
-import numpy as np
-from sklearn.model_selection import train_test_split
-import os
-
-
-# Of course we repeat some stuff from Chapter 3, namely to load the dataset
-
-DataViz = VisualizeDataset()
-
-# Read the result from the previous chapter, and make sure the index is of the type datetime.
-dataset_path = './intermediate_datafiles/'
-export_tree_path = 'Example_graphs/Chapter7/'
-
-try:
- dataset = pd.read_csv(dataset_path + 'chapter5_result.csv', index_col=0)
-except IOError as e:
- print('File not found, try to run previous crowdsignals scripts first!')
- raise e
-dataset = pd.read_csv(dataset_path + 'chapter5_result.csv', index_col=0)
-dataset.index = dataset.index.to_datetime()
-
-if not os.path.exists(export_tree_path):
- os.makedirs(export_tree_path)
-
-# Let us consider our second task, namely the prediction of the heart rate. We consider this as a temporal task.
-
-prepare = PrepareDatasetForLearning()
-
-train_X, test_X, train_y, test_y = prepare.split_single_dataset_regression_by_time(dataset, 'hr_watch_rate', '2016-02-08 18:28:56',
- '2016-02-08 19:34:07', '2016-02-08 20:07:50')
-# '2016-02-08 18:28:58','2016-02-08 18:28:59')
-
-print 'Training set length is: ', len(train_X.index)
-print 'Test set length is: ', len(test_X.index)
-
-# Select subsets of the features that we will consider:
-
-basic_features = ['acc_phone_x','acc_phone_y','acc_phone_z','acc_watch_x','acc_watch_y','acc_watch_z','gyr_phone_x','gyr_phone_y','gyr_phone_z','gyr_watch_x','gyr_watch_y','gyr_watch_z',
- 'labelOnTable','labelSitting','labelWashingHands','labelWalking','labelStanding','labelDriving','labelEating','labelRunning',
- 'light_phone_lux','mag_phone_x','mag_phone_y','mag_phone_z','mag_watch_x','mag_watch_y','mag_watch_z','press_phone_pressure']
-pca_features = ['pca_1','pca_2','pca_3','pca_4','pca_5','pca_6','pca_7']
-time_features = [name for name in dataset.columns if ('temp_' in name and not 'hr_watch' in name)]
-freq_features = [name for name in dataset.columns if (('_freq' in name) or ('_pse' in name))]
-print '#basic features: ', len(basic_features)
-print '#PCA features: ', len(pca_features)
-print '#time features: ', len(time_features)
-print '#frequency features: ', len(freq_features)
-cluster_features = ['cluster']
-print '#cluster features: ', len(cluster_features)
-features_after_chapter_3 = list(set().union(basic_features, pca_features))
-features_after_chapter_4 = list(set().union(basic_features, pca_features, time_features, freq_features))
-features_after_chapter_5 = list(set().union(basic_features, pca_features, time_features, freq_features, cluster_features))
-
-fs = FeatureSelectionRegression()
-
-# First, let us consider the Pearson correlations and see whether we can select based on them.
-features, correlations = fs.pearson_selection(10, train_X[features_after_chapter_5], train_y)
-util.print_pearson_correlations(correlations)
-
-# We select the 10 features with the highest correlation.
-
-selected_features = ['temp_pattern_labelOnTable','labelOnTable','temp_pattern_labelOnTable(b)labelOnTable','pca_2_temp_mean_ws_120',
- 'pca_1_temp_mean_ws_120','acc_watch_y_temp_mean_ws_120','pca_2','acc_phone_z_temp_mean_ws_120',
- 'gyr_watch_y_pse','gyr_watch_x_pse']
-
-possible_feature_sets = [basic_features, features_after_chapter_3, features_after_chapter_4, features_after_chapter_5, selected_features]
-feature_names = ['initial set', 'Chapter 3', 'Chapter 4', 'Chapter 5', 'Selected features']
-
-# Let us first study the importance of the parameter settings.
-
-learner = RegressionAlgorithms()
-eval = RegressionEvaluation()
-
-# We repeat the experiment a number of times to get a bit more robust data as the initialization of the NN is random.
-
-repeats = 5
-
-scores_over_all_algs = []
-
-for i in range(0, len(possible_feature_sets)):
-
- selected_train_X = train_X[possible_feature_sets[i]]
- selected_test_X = test_X[possible_feature_sets[i]]
-
- # First we run our non deterministic classifiers a number of times to average their score.
-
- performance_tr_nn = 0
- performance_tr_nn_std = 0
- performance_tr_rf = 0
- performance_tr_rf_std = 0
- performance_tr_svm = 0
- performance_tr_svm_std = 0
- performance_te_nn = 0
- performance_te_nn_std = 0
- performance_te_rf = 0
- performance_te_rf_std = 0
- performance_te_svm = 0
- performance_te_svm_std = 0
-
- for repeat in range(0, repeats):
- regr_train_y, regr_test_y = learner.feedforward_neural_network(selected_train_X, train_y, selected_test_X, gridsearch=True)
-
- mean_tr, std_tr = eval.mean_squared_error_with_std(train_y, regr_train_y)
- mean_te, std_te = eval.mean_squared_error_with_std(test_y, regr_test_y)
- mean_training = eval.mean_squared_error(train_y, regr_train_y)
- performance_tr_nn += mean_tr
- performance_tr_nn_std += std_tr
- performance_te_nn += mean_te
- performance_te_nn_std += std_te
-
- regr_train_y, regr_test_y = learner.random_forest(selected_train_X, train_y, selected_test_X, gridsearch=True)
- mean_tr, std_tr = eval.mean_squared_error_with_std(train_y, regr_train_y)
- mean_te, std_te = eval.mean_squared_error_with_std(test_y, regr_test_y)
- performance_tr_rf += mean_tr
- performance_tr_rf_std += std_tr
- performance_te_rf += mean_te
- performance_te_rf_std += std_te
-
- overall_performance_tr_nn = performance_tr_nn/repeats
- overall_performance_tr_nn_std = performance_tr_nn_std/repeats
- overall_performance_te_nn = performance_te_nn/repeats
- overall_performance_te_nn_std = performance_te_nn_std/repeats
- overall_performance_tr_rf = performance_tr_rf/repeats
- overall_performance_tr_rf_std = performance_tr_rf_std/repeats
- overall_performance_te_rf = performance_te_rf/repeats
- overall_performance_te_rf_std = performance_te_rf_std/repeats
-
- # And we run our deterministic algorithms:
-
- regr_train_y, regr_test_y = learner.support_vector_regression_without_kernel(selected_train_X, train_y, selected_test_X, gridsearch=True)
- mean_tr, std_tr = eval.mean_squared_error_with_std(train_y, regr_train_y)
- mean_te, std_te = eval.mean_squared_error_with_std(test_y, regr_test_y)
- performance_tr_svm = mean_tr
- performance_tr_svm_std = std_tr
- performance_te_svm = mean_te
- performance_te_svm_std = std_te
-
- regr_train_y, regr_test_y = learner.k_nearest_neighbor(selected_train_X, train_y, selected_test_X, gridsearch=True)
- mean_tr, std_tr = eval.mean_squared_error_with_std(train_y, regr_train_y)
- mean_te, std_te = eval.mean_squared_error_with_std(test_y, regr_test_y)
- performance_tr_knn = mean_tr
- performance_tr_knn_std = std_tr
- performance_te_knn = mean_te
- performance_te_knn_std = std_te
-
- regr_train_y, regr_test_y = learner.decision_tree(selected_train_X, train_y, selected_test_X, gridsearch=True, export_tree_path=export_tree_path)
-
- mean_tr, std_tr = eval.mean_squared_error_with_std(train_y, regr_train_y)
- mean_te, std_te = eval.mean_squared_error_with_std(test_y, regr_test_y)
- performance_tr_dt = mean_tr
- performance_tr_dt_std = std_tr
- performance_te_dt = mean_te
- performance_te_dt_std = std_te
-
- scores_with_sd = [(overall_performance_tr_nn, overall_performance_tr_nn_std, overall_performance_te_nn, overall_performance_te_nn_std),
- (overall_performance_tr_rf, overall_performance_tr_rf_std, overall_performance_te_rf, overall_performance_te_rf_std),
- (performance_tr_svm, performance_tr_svm_std, performance_te_svm, performance_te_svm_std),
- (performance_tr_knn, performance_tr_knn_std, performance_te_knn, performance_te_knn_std),
- (performance_tr_dt, performance_tr_dt_std, performance_te_dt, performance_te_dt_std)]
- util.print_table_row_performances_regression(feature_names[i], len(selected_train_X.index), len(selected_test_X.index), scores_with_sd)
- scores_over_all_algs.append(scores_with_sd)
-
-print scores_over_all_algs
-DataViz.plot_performances_regression(['NN', 'RF', 'SVM', 'KNN', 'DT'], feature_names, scores_over_all_algs)
-
-regr_train_y, regr_test_y = learner.random_forest(train_X[features_after_chapter_5], train_y, test_X[features_after_chapter_5], gridsearch=True, print_model_details=True)
-DataViz.plot_numerical_prediction_versus_real(train_X.index, train_y, regr_train_y, test_X.index, test_y, regr_test_y, 'heart rate')
diff --git a/PythonCode/crowdsignals_ch8_regression.py b/PythonCode/crowdsignals_ch8_regression.py
deleted file mode 100644
index 96f6eb77..00000000
--- a/PythonCode/crowdsignals_ch8_regression.py
+++ /dev/null
@@ -1,183 +0,0 @@
-##############################################################
-# #
-# Mark Hoogendoorn and Burkhardt Funk (2017) #
-# Machine Learning for the Quantified Self #
-# Springer #
-# Chapter 8 #
-# #
-##############################################################
-
-from util.VisualizeDataset import VisualizeDataset
-from Chapter7.PrepareDatasetForLearning import PrepareDatasetForLearning
-from Chapter7.Evaluation import RegressionEvaluation
-from Chapter8.LearningAlgorithmsTemporal import TemporalClassificationAlgorithms
-from Chapter8.LearningAlgorithmsTemporal import TemporalRegressionAlgorithms
-from statsmodels.tsa.stattools import adfuller
-from pandas.tools.plotting import autocorrelation_plot
-
-import copy
-import pandas as pd
-from util import util
-import matplotlib.pyplot as plot
-import numpy as np
-from sklearn.model_selection import train_test_split
-
-
-# Of course we repeat some stuff from Chapter 3, namely to load the dataset
-
-DataViz = VisualizeDataset()
-
-# Read the result from the previous chapter, and make sure the index is of the type datetime.
-dataset_path = './intermediate_datafiles/'
-
-try:
- dataset = pd.read_csv(dataset_path + 'chapter5_result.csv', index_col=0)
-except IOError as e:
- print('File not found, try to run previous crowdsignals scripts first!')
- raise e
-
-dataset.index = dataset.index.to_datetime()
-
-# Let us consider our second task, namely the prediction of the heart rate. We consider this as a temporal task.
-
-prepare = PrepareDatasetForLearning()
-
-train_X, test_X, train_y, test_y = prepare.split_single_dataset_regression_by_time(dataset, 'hr_watch_rate', '2016-02-08 18:29:56',
-# '2016-02-08 18:29:58','2016-02-08 18:29:59')
- '2016-02-08 19:34:07', '2016-02-08 20:07:50')
-
-print 'Training set length is: ', len(train_X.index)
-print 'Test set length is: ', len(test_X.index)
-
-# Select subsets of the features that we will consider:
-
-print 'Training set length is: ', len(train_X.index)
-print 'Test set length is: ', len(test_X.index)
-
-# Select subsets of the features that we will consider:
-
-basic_features = ['acc_phone_x','acc_phone_y','acc_phone_z','acc_watch_x','acc_watch_y','acc_watch_z','gyr_phone_x','gyr_phone_y','gyr_phone_z','gyr_watch_x','gyr_watch_y','gyr_watch_z',
- 'labelOnTable','labelSitting','labelWashingHands','labelWalking','labelStanding','labelDriving','labelEating','labelRunning',
- 'light_phone_lux','mag_phone_x','mag_phone_y','mag_phone_z','mag_watch_x','mag_watch_y','mag_watch_z','press_phone_pressure']
-pca_features = ['pca_1','pca_2','pca_3','pca_4','pca_5','pca_6','pca_7']
-time_features = [name for name in dataset.columns if ('temp_' in name and not 'hr_watch' in name)]
-freq_features = [name for name in dataset.columns if (('_freq' in name) or ('_pse' in name))]
-print '#basic features: ', len(basic_features)
-print '#PCA features: ', len(pca_features)
-print '#time features: ', len(time_features)
-print '#frequency features: ', len(freq_features)
-cluster_features = ['cluster']
-print '#cluster features: ', len(cluster_features)
-features_after_chapter_3 = list(set().union(basic_features, pca_features))
-features_after_chapter_4 = list(set().union(basic_features, pca_features, time_features, freq_features))
-features_after_chapter_5 = list(set().union(basic_features, pca_features, time_features, freq_features, cluster_features))
-
-selected_features = ['temp_pattern_labelOnTable','labelOnTable', 'temp_pattern_labelOnTable(b)labelOnTable', 'cluster',
- 'pca_1_temp_mean_ws_120','pca_2_temp_mean_ws_120','pca_2','acc_watch_y_temp_mean_ws_120','gyr_watch_y_pse',
- 'gyr_watch_x_pse']
-possible_feature_sets = [basic_features, features_after_chapter_3, features_after_chapter_4, features_after_chapter_5, selected_features]
-feature_names = ['initial set', 'Chapter 3', 'Chapter 4', 'Chapter 5', 'Selected features']
-
-# Let us first study whether the time series is stationary and what the autocorrelations are.
-
-dftest = adfuller(dataset['hr_watch_rate'], autolag='AIC')
-print dftest
-
-autocorrelation_plot(dataset['hr_watch_rate'])
-plot.show()
-
-# Now let us focus on the learning part.
-
-learner = TemporalRegressionAlgorithms()
-eval = RegressionEvaluation()
-
-# We repeat the experiment a number of times to get a bit more robust data as the initialization of the NN is random.
-
-repeats = 5
-
-# we set a washout time to give the NN's the time to stabilize. We do not compute the error during the washout time.
-
-washout_time = 10
-
-scores_over_all_algs = []
-
-for i in range(0, len(possible_feature_sets)):
-
- selected_train_X = train_X[possible_feature_sets[i]]
- selected_test_X = test_X[possible_feature_sets[i]]
-
- # First we run our non deterministic classifiers a number of times to average their score.
-
- performance_tr_res = 0
- performance_tr_res_std = 0
- performance_te_res = 0
- performance_te_res_std = 0
- performance_tr_rnn = 0
- performance_tr_rnn_std = 0
- performance_te_rnn = 0
- performance_te_rnn_std = 0
-
- for repeat in range(0, repeats):
- print '----', repeat
- regr_train_y, regr_test_y = learner.reservoir_computing(selected_train_X, train_y, selected_test_X, test_y, gridsearch=True, per_time_step=False)
-
- mean_tr, std_tr = eval.mean_squared_error_with_std(train_y.ix[washout_time:,], regr_train_y.ix[washout_time:,])
- mean_te, std_te = eval.mean_squared_error_with_std(test_y.ix[washout_time:,], regr_test_y.ix[washout_time:,])
-
- performance_tr_res += mean_tr
- performance_tr_res_std += std_tr
- performance_te_res += mean_te
- performance_te_res_std += std_te
-
- regr_train_y, regr_test_y = learner.recurrent_neural_network(selected_train_X, train_y, selected_test_X, test_y, gridsearch=True)
-
- mean_tr, std_tr = eval.mean_squared_error_with_std(train_y.ix[washout_time:,], regr_train_y.ix[washout_time:,])
- mean_te, std_te = eval.mean_squared_error_with_std(test_y.ix[washout_time:,], regr_test_y.ix[washout_time:,])
-
- performance_tr_rnn += mean_tr
- performance_tr_rnn_std += std_tr
- performance_te_rnn += mean_te
- performance_te_rnn_std += std_te
-
-
- # We only apply the time series in case of the basis features.
- if (feature_names[i] == 'initial set'):
- regr_train_y, regr_test_y = learner.time_series(selected_train_X, train_y, selected_test_X, test_y, gridsearch=True)
-
- mean_tr, std_tr = eval.mean_squared_error_with_std(train_y.ix[washout_time:,], regr_train_y.ix[washout_time:,])
- mean_te, std_te = eval.mean_squared_error_with_std(test_y.ix[washout_time:,], regr_test_y.ix[washout_time:,])
-
- overall_performance_tr_ts = mean_tr
- overall_performance_tr_ts_std = std_tr
- overall_performance_te_ts = mean_te
- overall_performance_te_ts_std = std_te
- else:
- overall_performance_tr_ts = 0
- overall_performance_tr_ts_std = 0
- overall_performance_te_ts = 0
- overall_performance_te_ts_std = 0
-
- overall_performance_tr_res = performance_tr_res/repeats
- overall_performance_tr_res_std = performance_tr_res_std/repeats
- overall_performance_te_res = performance_te_res/repeats
- overall_performance_te_res_std = performance_te_res_std/repeats
- overall_performance_tr_rnn = performance_tr_rnn/repeats
- overall_performance_tr_rnn_std = performance_tr_rnn_std/repeats
- overall_performance_te_rnn = performance_te_rnn/repeats
- overall_performance_te_rnn_std = performance_te_rnn_std/repeats
-
- scores_with_sd = [(overall_performance_tr_res, overall_performance_tr_res_std, overall_performance_te_res, overall_performance_te_res_std),
- (overall_performance_tr_rnn, overall_performance_tr_rnn_std, overall_performance_te_rnn, overall_performance_te_rnn_std),
- (overall_performance_tr_ts, overall_performance_tr_ts_std, overall_performance_te_ts, overall_performance_te_ts_std)]
- print scores_with_sd
- util.print_table_row_performances_regression(feature_names[i], len(selected_train_X.index), len(selected_test_X.index), scores_with_sd)
- scores_over_all_algs.append(scores_with_sd)
-
-DataViz.plot_performances_regression(['Reservoir', 'RNN', 'Time series'], feature_names, scores_over_all_algs)
-
-regr_train_y, regr_test_y = learner.reservoir_computing(train_X[features_after_chapter_5], train_y, test_X[features_after_chapter_5], test_y, gridsearch=True)
-DataViz.plot_numerical_prediction_versus_real(train_X.index, train_y, regr_train_y['hr_watch_rate'], test_X.index, test_y, regr_test_y['hr_watch_rate'], 'heart rate')
-regr_train_y, regr_test_y = learner.recurrent_neural_network(train_X[basic_features], train_y, test_X[basic_features], test_y, gridsearch=True)
-DataViz.plot_numerical_prediction_versus_real(train_X.index, train_y, regr_train_y['hr_watch_rate'], test_X.index, test_y, regr_test_y['hr_watch_rate'], 'heart rate')
-regr_train_y, regr_test_y = learner.time_series(train_X[basic_features], train_y, test_X[features_after_chapter_5], test_y, gridsearch=True)
-DataViz.plot_numerical_prediction_versus_real(train_X.index, train_y, regr_train_y['hr_watch_rate'], test_X.index, test_y, regr_test_y['hr_watch_rate'], 'heart rate')
diff --git a/PythonCode/util/VisualizeDataset.py b/PythonCode/util/VisualizeDataset.py
deleted file mode 100644
index a7e7345e..00000000
--- a/PythonCode/util/VisualizeDataset.py
+++ /dev/null
@@ -1,365 +0,0 @@
-import matplotlib.pyplot as plot
-import matplotlib.dates as md
-import numpy as np
-import pandas as pd
-from mpl_toolkits.mplot3d import Axes3D
-import matplotlib.patches as mpatches
-import matplotlib.cm as cm
-from scipy.cluster.hierarchy import dendrogram
-import itertools
-from scipy.optimize import curve_fit
-import math
-import sys
-import dateutil
-
-class VisualizeDataset:
-
- point_displays = ['+', 'x'] #'*', 'd', 'o', 's', '<', '>']
- line_displays = ['-'] #, '--', ':', '-.']
- colors = ['r', 'g', 'b', 'c', 'm', 'y', 'k']
-
- # Plot the dataset, here columns can specify a specific attribute, but also a generic name that occurs
- # among multiple attributes (e.g. label which occurs as labelWalking, etc). In such a case they are plotted
- # in the same graph. The display should express whether points or a line should be plotted.
- # Match can be 'exact' or 'like'. Display can be 'points' or 'line'.
- def plot_dataset(self, data_table, columns, match='like', display='line'):
- names = list(data_table.columns)
-
- # Create subplots if more columns are specified.
- if len(columns) > 1:
- f, xar = plot.subplots(len(columns), sharex=True, sharey=False)
- else:
- f, xar = plot.subplots()
- xar = [xar]
-
- f.subplots_adjust(hspace=0.4)
- plot.hold(True)
- xfmt = md.DateFormatter('%H:%M')
-
- # Pass through the columns specified.
- for i in range(0, len(columns)):
- xar[i].xaxis.set_major_formatter(xfmt)
- # We can match exact (i.e. a columns name is an exact name of a columns or 'like' for
- # which we need to find columns names in the dataset that contain the name.
- if match[i] == 'exact':
- relevant_dataset_cols = [columns[i]]
- elif match[i] == 'like':
- relevant_dataset_cols = [name for name in names if columns[i] == name[0:len(columns[i])]]
- else:
- raise ValueError("Match should be 'exact' or 'like' for " + str(i) + ".")
-
- max_values = []
- min_values = []
- # Pass through the relevant columns.
- for j in range(0, len(relevant_dataset_cols)):
- # Create a mask to ignore the NaN values when plotting:
- mask = data_table[relevant_dataset_cols[j]].notnull()
- max_values.append(data_table[relevant_dataset_cols[j]][mask].max())
- min_values.append(data_table[relevant_dataset_cols[j]][mask].min())
-
- # Display point, or as a line
- if display[i] == 'points':
- xar[i].plot(data_table.index[mask], data_table[relevant_dataset_cols[j]][mask], self.point_displays[j%len(self.point_displays)])
- else:
- xar[i].plot(data_table.index[mask], data_table[relevant_dataset_cols[j]][mask], self.line_displays[j%len(self.line_displays)])
- xar[i].tick_params(axis='y', labelsize=10)
- xar[i].legend(relevant_dataset_cols, fontsize='xx-small', numpoints=1, loc='upper center', bbox_to_anchor=(0.5, 1.3), ncol=len(relevant_dataset_cols), fancybox=True, shadow=True)
- xar[i].set_ylim([min(min_values) - 0.1*(max(max_values) - min(min_values)), max(max_values) + 0.1*(max(max_values) - min(min_values))])
- # Make sure we get a nice figure with only a single x-axis and labels there.
- plot.setp([a.get_xticklabels() for a in f.axes[:-1]], visible=False)
- plot.xlabel('time')
- plot.show()
-
- def plot_dataset_boxplot(self, dataset, cols):
- ax = dataset[cols].plot.box()
- ax.set_ylim(-30,30)
- plot.show()
-
- # This function plots the real and imaginary amplitudes of the frequencies found in the Fourier transformation.
- def plot_fourier_amplitudes(self, freq, ampl_real, ampl_imag):
- plot.hold(True)
- plot.xlabel('Freq(Hz)')
- plot.ylabel('amplitude')
- # Plot the real values as a '+' and imaginary in the same way (though with a different color).
- plot.plot(freq, ampl_real, '+', freq, ampl_imag,'+')
- plot.legend(['real', 'imaginary'], numpoints=1)
- plot.hold(False)
- plot.show()
-
- # Plot outliers in case of a binary outlier score. Here, the col specifies the real data
- # column and outlier_col the columns with a binary value (outlier or not)
- def plot_binary_outliers(self, data_table, col, outlier_col):
- data_table = data_table.dropna(axis=0, subset=[col, outlier_col])
- data_table[outlier_col] = data_table[outlier_col].astype('bool')
- f, xar = plot.subplots()
- plot.hold(True)
- xfmt = md.DateFormatter('%H:%M')
- xar.xaxis.set_major_formatter(xfmt)
- plot.xlabel('time')
- plot.ylabel('value')
- # Plot data points that are outliers in red, and non outliers in blue.
- xar.plot(data_table.index[data_table[outlier_col]], data_table[col][data_table[outlier_col]], 'r+')
- xar.plot(data_table.index[~data_table[outlier_col]], data_table[col][~data_table[outlier_col]], 'b+')
- plot.legend(['outlier ' + col, 'no outlier' + col], numpoints=1, fontsize='xx-small', loc='upper center', ncol=2, fancybox=True, shadow=True)
- plot.hold(False)
- plot.show()
-
- # Plot values that have been imputed using one of our imputation approaches. Here, values expresses the
- # 1 to n datasets that have resulted from value imputation.
- def plot_imputed_values(self, data_table, names, col, *values):
-
- xfmt = md.DateFormatter('%H:%M')
-
- # Create proper subplots.
- if len(values) > 0:
- f, xar = plot.subplots(len(values) + 1, sharex=True, sharey=False)
- else:
- f, xar = plot.subplots()
- xar = [xar]
-
- f.subplots_adjust(hspace=0.4)
- plot.hold(True)
-
- # plot the regular dataset.
-
- xar[0].xaxis.set_major_formatter(xfmt)
- xar[0].plot(data_table.index[data_table[col].notnull()], data_table[col][data_table[col].notnull()], 'b+', markersize='2')
- xar[0].legend([names[0]], fontsize='small', numpoints=1, loc='upper center', bbox_to_anchor=(0.5, 1.3), ncol=1, fancybox=True, shadow=True)
-
- # and plot the others that have resulted from imputation.
- for i in range(1, len(values)+1):
- xar[i].xaxis.set_major_formatter(xfmt)
- xar[i].plot(data_table.index, values[i-1], 'b+', markersize='2')
- xar[i].legend([names[i]], fontsize='small', numpoints=1, loc='upper center', bbox_to_anchor=(0.5, 1.3), ncol=1, fancybox=True, shadow=True)
-
-
- # Diplay is nicely in subplots.
- plot.setp([a.get_xticklabels() for a in f.axes[:-1]], visible=False)
- plot.xlabel('time')
- plot.hold(False)
- plot.show()
-
- # This function plots clusters that result from the application of a clustering algorithm
- # and also shows the class label of points. Clusters are displayed via colors, classes
- # by means of different types of points. We assume that three data columns are clustered
- # that do not include the label. We assume the labels to be represented by 1 or more binary
- # columns.
- def plot_clusters_3d(self, data_table, data_cols, cluster_col, label_cols):
-
- color_index = 0
- point_displays = ['+', 'x', '*', 'd', 'o', 's', '<', '>']
-
- # Determine the number of clusters:
- clusters = data_table[cluster_col].unique()
- labels = []
-
- # Get the possible labels, assuming 1 or more label columns with binary values.
- for i in range(0, len(label_cols)):
- labels.extend([name for name in list(data_table.columns) if label_cols[i] == name[0:len(label_cols[i])]])
-
- fig = plot.figure()
- ax = fig.add_subplot(111, projection='3d')
- handles = []
-
- # Plot clusters individually with a certain color.
- for cluster in clusters:
- marker_index = 0
- # And make sure the points of a label receive the right marker type.
- for label in labels:
- rows = data_table.ix[(data_table[cluster_col] == cluster) & (data_table[label] > 0)]
- # Now we come to the assumption that there are three data_cols specified:
- if not len(data_cols) == 3:
- return
- plot_color = self.colors[color_index%len(self.colors)]
- plot_marker = point_displays[marker_index%len(point_displays)]
- pt = ax.scatter(rows[data_cols[0]], rows[data_cols[1]], rows[data_cols[2]], c=plot_color, marker=plot_marker)
- plot.hold(True)
- if color_index == 0:
- handles.append(pt)
- ax.set_xlabel(data_cols[0])
- ax.set_ylabel(data_cols[1])
- ax.set_zlabel(data_cols[2])
- marker_index += 1
- color_index += 1
-
- plot.legend(handles, labels, fontsize='xx-small', numpoints=1)
- plot.hold(False)
- plot.show()
-
- # This function plots the silhouettes of the different clusters that have been identified. It plots the
- # silhouette of the individual datapoints per cluster to allow studying the clusters internally as well.
- # For this, a column expressing the silhouette for each datapoint is assumed.
- def plot_silhouette(self, data_table, cluster_col, silhouette_col):
- # Taken from the examples of scikit learn
- #(http://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html)
-
- clusters = data_table[cluster_col].unique()
-
- fig, ax1 = plot.subplots(1, 1)
- ax1.set_xlim([-0.1, 1])
- #ax1.set_ylim([0, len(data_table.index) + (len(clusters) + 1) * 10])
- y_lower = 10
- for i in range(0, len(clusters)):
- # Aggregate the silhouette scores for samples belonging to
- # cluster i, and sort them
- rows = data_table.mask(data_table[cluster_col] == clusters[i])
- ith_cluster_silhouette_values = np.array(rows[silhouette_col])
- ith_cluster_silhouette_values.sort()
-
- size_cluster_i = len(rows.index)
- y_upper = y_lower + size_cluster_i
-
- color = cm.spectral(float(i) / len(clusters))
- ax1.fill_betweenx(np.arange(y_lower, y_upper),
- 0, ith_cluster_silhouette_values,
- facecolor=color, edgecolor=color, alpha=0.7)
-
- # Label the silhouette plots with their cluster numbers at the middle
- ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
-
- # Compute the new y_lower for next plot
- y_lower = y_upper + 10 # 10 for the 0 samples
-
- ax1.set_title("The silhouette plot for the various clusters.")
- ax1.set_xlabel("The silhouette coefficient values")
- ax1.set_ylabel("Cluster label")
-
- # The vertical line for average silhouette score of all the values
- ax1.axvline(x=data_table[silhouette_col].mean(), color="red", linestyle="--")
-
- ax1.set_yticks([]) # Clear the yaxis labels / ticks
- ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
- plot.show()
-
- # Plot a dendorgram for hierarchical clustering. It assumes that the linkage as
- # used in sk learn is passed as an argument as well.
- def plot_dendrogram(self, dataset, linkage):
- sys.setrecursionlimit(40000)
- plot.title('Hierarchical Clustering Dendrogram')
- plot.xlabel('time points')
- plot.ylabel('distance')
- times = dataset.index.strftime('%H:%M:%S')
- #dendrogram(linkage,truncate_mode='lastp',p=10, show_leaf_counts=True, leaf_rotation=90.,leaf_font_size=12.,show_contracted=True, labels=times)
- dendrogram(linkage,truncate_mode='lastp',p=16, show_leaf_counts=True, leaf_rotation=45.,leaf_font_size=8.,show_contracted=True, labels=times)
- plot.show()
-
- # Plot the confusion matrix that has been derived in the evaluation metrics. Classes expresses the labels
- # for the matrix. We can normalize or show the raw counts. Of course this applies to classification problems.
- def plot_confusion_matrix(self, cm, classes, normalize=False):
- # Taken from http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
-
- # Select the colormap.
- cmap=plot.cm.Blues
- plot.imshow(cm, interpolation='nearest', cmap=cmap)
- plot.title('confusion matrix')
- plot.colorbar()
- tick_marks = np.arange(len(classes))
- plot.xticks(tick_marks, classes, rotation=45)
- plot.yticks(tick_marks, classes)
-
- if normalize:
- cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
-
- thresh = cm.max() / 2.
- for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
- plot.text(j, i, cm[i, j], horizontalalignment="center", color="white" if cm[i, j] > thresh else "black")
-
- plot.tight_layout()
- plot.ylabel('True label')
- plot.xlabel('Predicted label')
- plot.show()
-
- # This function plots the predictions or an algorithms (both for the training and test set) versus the real values for
- # a regression problem. It assumes only a single value to be predicted over a number of cases. The variables identified
- # with reg_ are the predictions.
- def plot_numerical_prediction_versus_real(self, train_time, train_y, regr_train_y, test_time, test_y, regr_test_y, label):
- self.legends = {}
- plot.title('Performance of model for ' + str(label))
-
- # Plot the values, training set cases in blue, test set in red.
- f, xar = plot.subplots()
- plot.hold(True)
- xfmt = md.DateFormatter('%H:%M')
- xar.xaxis.set_major_formatter(xfmt)
- xar.plot(train_time, train_y, '-', linewidth=0.5)
- xar.plot(train_time, regr_train_y, '--', linewidth=0.5)
-
- xar.plot(test_time, test_y, '-', linewidth=0.5)
- xar.plot(test_time, regr_test_y, '--', linewidth=0.5)
-
- plot.legend(['real values training', 'predicted values training', 'real values test', 'predicted values test'], loc=4)
-
-
- # And create some fancy stuff in the figure to label the training and test set a bit clearer.
- max_y_value = max(max(train_y.tolist()), max(regr_train_y.tolist()), max(test_y.tolist()), max(regr_test_y.tolist()))
- min_y_value = min(min(train_y.tolist()), min(regr_train_y.tolist()), min(test_y.tolist()), min(regr_test_y.tolist()))
- range = max_y_value - min_y_value
- y_coord_labels = max(max(train_y.tolist()), max(regr_train_y.tolist()), max(test_y.tolist()), max(regr_test_y.tolist()))+(0.01*range)
-
-
- plot.ylabel(label)
- plot.xlabel('time')
- plot.annotate('', xy=(train_time[0],y_coord_labels), xycoords='data', xytext=(train_time[-1], y_coord_labels), textcoords='data', arrowprops={'arrowstyle': '<->'})
- plot.annotate('training set', xy=(train_time[int(float(len(train_time))/2)], y_coord_labels*1.02), color='blue', xycoords='data', ha='center')
- plot.annotate('', xy=(test_time[0], y_coord_labels), xycoords='data', xytext=(test_time[-1], y_coord_labels), textcoords='data', arrowprops={'arrowstyle': '<->'})
- plot.annotate('test set', xy=(test_time[int(float(len(test_time))/2)], y_coord_labels*1.02), color='red', xycoords='data', ha='center')
- plot.hold(False)
- plot.show()
-
- # Plot the Pareto front for multi objective optimization problems (for the dynamical systems stuff). We consider the
- # raw output of the MO dynamical systems approach, which includes rows with the fitness and predictions for the training
- # and test set. We select the fitness and plot them in a graph. Note that the plot only considers the first two dimensions.
- def plot_pareto_front(self, dynsys_output):
- fit_1_train = []
- fit_2_train = []
- fit_1_test = []
- fit_2_test = []
- for row in dynsys_output:
- fit_1_train.append(row[1][0])
- fit_2_train.append(row[1][1])
- plot.hold(True)
-
- plot.scatter(fit_1_train, fit_2_train, color='r')
- plot.xlabel('mse on ' + str(dynsys_output[0][0].columns[0]))
- plot.ylabel('mse on ' + str(dynsys_output[0][0].columns[0]))
- #plt.savefig('{0} Example ({1}).pdf'.format(ea.__class__.__name__, problem.__class__.__name__), format='pdf')
- plot.show()
-
- # Plot a prediction for a regression model in case it concerns a multi-objective dynamical systems model. Here, we plot
- # the individual specified. Again, the complete output of the MO approach is used as argument.
- def plot_numerical_prediction_versus_real_dynsys_mo(self, train_time, train_y, test_time, test_y, dynsys_output, individual, label):
- regr_train_y = dynsys_output[individual][0][label]
- regr_test_y = dynsys_output[individual][2][label]
- train_y = train_y[label]
- test_y = test_y[label]
- self.plot_numerical_prediction_versus_real(train_time, train_y, regr_train_y, test_time, test_y, regr_test_y, label)
-
- # Visualizes the performance of different algorithms over different feature sets. Assumes the scores to contain
- # a score on the training set followed by an sd, and the same for the test set.
-
- def plot_performances(self, algs, feature_subset_names, scores_over_all_algs, ylim, std_mult, y_name):
-
- plot.hold(True)
- width = float(1)/(len(feature_subset_names)+1)
- ind = np.arange(len(algs))
- for i in range(0, len(feature_subset_names)):
- means = []
- std = []
- for j in range(0, len(algs)):
- means.append(scores_over_all_algs[i][j][2])
- std.append(std_mult * scores_over_all_algs[i][j][3])
- plot.errorbar(ind + i * width, means, yerr=std, fmt=self.colors[i%len(self.colors)] + 'o', markersize='3')
- plot.ylabel(y_name)
- plot.xticks(ind+(float(len(feature_subset_names))/2)*width, algs)
- plot.legend(feature_subset_names, loc=4, numpoints=1)
- if not ylim is None:
- plot.ylim(ylim)
- # plot.tight_layout()
- plot.savefig('perf_overview.png')
- plot.show()
-
- def plot_performances_classification(self, algs, feature_subset_names, scores_over_all_algs):
- self.plot_performances(algs, feature_subset_names, scores_over_all_algs, [0.70, 1.0], 2, 'Accuracy')
-
- def plot_performances_regression(self, algs, feature_subset_names, scores_over_all_algs):
- self.plot_performances(algs, feature_subset_names, scores_over_all_algs, None, 1, 'Mean Squared Error')
diff --git a/PythonCode/util/__init__.py b/PythonCode/util/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/PythonCode/util/util.py b/PythonCode/util/util.py
deleted file mode 100644
index f5cbf34f..00000000
--- a/PythonCode/util/util.py
+++ /dev/null
@@ -1,104 +0,0 @@
-import scipy
-import copy
-import math
-import numpy as np
-# Not a class, just a bunch of useful functions.
-
-def normalize_dataset(data_table, columns):
- dt_norm = copy.deepcopy(data_table)
- for col in columns:
- dt_norm[col] = (data_table[col] - data_table[col].mean()) / (data_table[col].max() - data_table[col].min())
- return dt_norm
-
-# Calculate the distance between rows.
-def distance(rows, d_function='euclidean'):
- if d_function == 'euclidean':
- # Assumes m rows and n columns (attributes), returns and array where each row represents
- # the distances to the other rows (except the own row).
- return scipy.spatial.distance.pdist(rows, 'euclidean')
- else:
- raise ValueError("Unknown distance value '" + d_function + "'")
-
-def print_statistics(dataset):
- print 'column, fraction missing values, mean, standard deviation, min, max'
- dataset_length = len(dataset.index)
- for col in dataset.columns:
- print col,
- print float((dataset_length - dataset[col].count()))/dataset_length,
- print dataset[col].mean(),
- print dataset[col].std(),
- print dataset[col].min(),
- print dataset[col].max()
-
-def print_table_cell(value1, value2):
- print "{0:.2f}".format(value1), ' / ', "{0:.2f}".format(value2),
-
-def print_latex_table_statistics_two_datasets(dataset1, dataset2):
- print 'attribute, fraction missing values, mean, standard deviation, min, max'
- dataset1_length = len(dataset1.index)
- dataset2_length = len(dataset2.index)
- for col in dataset1.columns:
- print col, '&',
- print_table_cell((float((dataset1_length - dataset1[col].count()))/dataset1_length)*100, (float((dataset2_length - dataset2[col].count()))/dataset2_length)*100)
- print ' & ',
- print_table_cell(dataset1[col].mean(), dataset2[col].mean())
- print ' & ',
- print_table_cell(dataset1[col].std(), dataset2[col].std())
- print ' & ',
- print_table_cell(dataset1[col].min(), dataset2[col].min())
- print ' & ',
- print_table_cell(dataset1[col].max(), dataset2[col].max())
- print
-
-def print_latex_statistics_clusters(dataset, cluster_col, input_cols, label_col):
- label_cols = [c for c in dataset.columns if label_col == c[0:len(label_col)]]
-
- clusters = dataset[cluster_col].unique()
-
- for c in input_cols:
- print '\multirow{2}{*}{', c, '} & mean ',
- for cluster in clusters:
- print ' & ', "{0:.2f}".format(dataset.ix[dataset[cluster_col] == cluster, c].mean()),
- print '\\\\'
- print ' & std ',
- for cluster in clusters:
- print ' & ', "{0:.2f}".format(dataset.ix[dataset[cluster_col] == cluster, c].std()),
- print '\\\\'
-
- for l in label_cols:
- print l, ' & percentage ',
- for cluster in clusters:
- print ' & ', "{0:.2f}".format((float(dataset.ix[dataset[cluster_col] == cluster, l].sum())/len(dataset[dataset[l] == 1].index) * 100)), '\%',
- print '\\\\'
-
-def print_table_row_performances(row_name, training_len, test_len, values):
- scores_over_sd = []
- print row_name,
-
- for val in values:
- print ' & ',
- sd_train = math.sqrt((val[0]*(1-val[0]))/training_len)
- print "{0:.4f}".format(val[0]),
- print '\\emph{(', "{0:.4f}".format(val[0]-2*sd_train), '-', "{0:.4f}".format(val[0]+2*sd_train), ')}', ' & ',
- sd_test = math.sqrt((val[1]*(1-val[1]))/test_len)
- print "{0:.4f}".format(val[1]),
- print '\\emph{(', "{0:.4f}".format(val[1]-2*sd_test), '-', "{0:.4f}".format(val[1]+2*sd_test), ')}',
- scores_over_sd.append([val[0], sd_train, val[1], sd_test])
- print '\\\\\\hline'
- return scores_over_sd
-
-def print_table_row_performances_regression(row_name, training_len, test_len, values):
- print row_name,
-
- for val in values:
- print ' & ',
- print "{0:.4f}".format(val[0]),
- print '\\emph{(', "{0:.4f}".format(val[1]), ')}', ' & ',
- print "{0:.4f}".format(val[2]),
- print '\\emph{(', "{0:.4f}".format(val[3]), ')}',
- print '\\\\\\hline'
-
-def print_pearson_correlations(correlations):
- for i in range(0, len(correlations)):
- if np.isfinite(correlations[i][1]):
- print correlations[i][0], ' & ', "{0:.4f}".format(correlations[i][1]), '\\\\\\hline'
diff --git a/RCode/Chapter2/createDataset.R b/RCode/Chapter2/createDataset.R
deleted file mode 100644
index 76c35495..00000000
--- a/RCode/Chapter2/createDataset.R
+++ /dev/null
@@ -1,59 +0,0 @@
-##########################################################################
-#
-# Mark Hoogendoorn & Burkhardt Funk (2017)
-# Machine Learning for the Quantified Self
-# Springer
-# ./Chapter2/createDataset.R
-#
-##########################################################################
-
-addNumericalDataset = function(df, filename, timestampCol, valueCols, func, prefix, path, timeWindow) {
-# can be implemented in a more efficient manner using the aggregate() function on the data.table itself,
-# however, for the sake of comparability it is implemented as in the Python code
- print(paste("Load: ",path,filename,sep=""))
- dataset = fread(paste (path,filename,sep=""))
- tics2millisec = 1000000
- dataset[,(timestampCol):=dataset[,timestampCol,with=FALSE]/tics2millisec]
-
- cNames = paste(prefix,valueCols,sep="")
-
- if (is.null(df)) {
- # create aggregated data.frame and time intervals
- tMin = min(dataset[,timestampCol,with=FALSE])
- tMax = max(dataset[,timestampCol,with=FALSE])
- timeInts = tMin + timeWindow*(0:(floor(tMax-tMin)/timeWindow))
-
- df = data.frame(time = timeInts)
- }
- df[,cNames] = NA
-
- j = 1
- for (t in df$time) {
- relevantRows = dataset[timestamps>=t×tampsdf$time,value[i]] = 1 #which.max(value[i] == eventValues)
- }
- return(df)
-}
\ No newline at end of file
diff --git a/RCode/Chapter2/visiualizeData.R b/RCode/Chapter2/visiualizeData.R
deleted file mode 100644
index 67937144..00000000
--- a/RCode/Chapter2/visiualizeData.R
+++ /dev/null
@@ -1,58 +0,0 @@
-##########################################################################
-#
-# Mark Hoogendoorn & Burkhardt Funk (2017)
-# Machine Learning for the Quantified Self
-# Springer
-# ./chapter2/visualizeData.R
-#
-##########################################################################
-
-# Plot the dataset, here columns can specify a specific attribute, but also a generic name that occurs
-# among multiple attributes (e.g. label which occurs as labelWalking, etc). In such a case they are plotted
-# in the same graph.
-
-require(ggplot2)
-
-plotDataset = function(df, columns, match) {
- cNames = colnames(df)
- p = list()
- if (is.null(match)) match = rep("like",length(columns))
- for (i in 1:length(columns)) {
- if(match[i] == "exact") {
- selCols = columns[i]
- } else {
- selCols = cNames[grepl(columns[i],cNames)]
- }
- dfTemp = df[,c("time",selCols)]
- dfTemp = melt(dfTemp,id.vars = c("time"))
- p[[i]] = plot_ly(dfTemp,x = ~time, y = ~value) %>%
- add_lines(color = ~variable )
- }
- return(p)
-}
-
-# Plot outliers in case of a binary outlier score. Here, the col species the real data column and outlier_col
-# the columns with a binary value (outlier or not)
-plotBinaryOutliers = function(df, col, outlier_col) {
- plot(df$time,df[,col],type="l", col = "blue" ,ylab = "value", xlab = "time",main = outlier_col)
- points(df[df[,outlier_col],"time"] ,df[df[,outlier_col],col],pch = 3, col = "red",ylab = "value", xlab = "time")
- legend("topright",c(paste("outlier",col),paste("no outlier",col)), pch = 3, col = c("red", "blue"), cex = 1)
- grid()
-}
-
-plotImputedValues = function(df,col,imputedList) {
- opar = par()
- par(mfrow= c(length(imputedList),1))
- for (i in 1:length(imputedList)) {
- plot(df$time,df[,col],pch = 20,col = "blue",ylab=paste(""),xlab="time")
- points(df$time[is.na(df[,col])],imputedList[[i]][is.na(df[,col]),col],pch=20,col="red")
- print(colnames(imputedList[[i]]))
- }
- par(opar)
-}
-
-plotBoxplot = function(df, col) {
- ggplot(stack(df[,col]), aes(x = ind, y = values)) +
- geom_boxplot() +
- xlab("") + ylab("")
-}
diff --git a/RCode/Chapter3/distanceBasedOutlierDetection.R b/RCode/Chapter3/distanceBasedOutlierDetection.R
deleted file mode 100644
index 1c60fcb4..00000000
--- a/RCode/Chapter3/distanceBasedOutlierDetection.R
+++ /dev/null
@@ -1,34 +0,0 @@
-##########################################################################
-#
-# Mark Hoogendoorn & Burkhardt Funk (2017)
-# Machine Learning for the Quantified Self
-# Springer
-# ./chapter3/distanceBasedOutlierDetection.R
-#
-##########################################################################
-
-# The most simple distance based algorithm. We assume a distance function, e.g. 'euclidean'
-# and a minimum distance of neighboring points and frequency of occurrence.
-simpleDistanceBased = function(df, cols, dmin, fmin, dFunction = "euclidean") {
- # Normalize data
- data = scale(df[,cols])
- data = data[!is.na(data)]
-
- d = as.matrix(dist(data, method = dFunction))
- df[!is.na(df[,col]),paste(cols,"_simpleDistOutlier",sep="")] = sapply(1:nrow(d), function(x) {(sum(d[x,-x]>dmin)/nrow(d))>fmin})
- return(df)
-}
-
-# Compute the local outlier factor. K is the number of neighboring points considered, d_function
-# the distance function again (e.g. 'euclidean').
-localOutlierFactor = function(df, cols, dFunction = "euclidean", k, plot = FALSE) {
- data = scale(df[,cols])
- data = data[!is.na(data)]
-
- df[!is.na(df[,col]),paste(cols,"_lofOutlier",sep="")] = as.data.frame(lof(data,k,method=dFunction))
- if(plot) {
- plot.ecdf(df$lof, xlim = c(0,10),main = "LOF diagnostic")
- grid()
- }
- return(df)
-}
\ No newline at end of file
diff --git a/RCode/Chapter3/distributionBasedOutlierDetection.R b/RCode/Chapter3/distributionBasedOutlierDetection.R
deleted file mode 100644
index dc93e968..00000000
--- a/RCode/Chapter3/distributionBasedOutlierDetection.R
+++ /dev/null
@@ -1,49 +0,0 @@
-##########################################################################
-#
-# Mark Hoogendoorn & Burkhardt Funk (2017)
-# Machine Learning for the Quantified Self
-# Springer
-# ./chapter3/distributionBasedOutlierDetection.R
-#
-##########################################################################
-
-# Finds outliers in the specified column of datatable and adds a binary columns with
-# the same name extended with '_outlier' that expresses the result per data point.
-require(mclust)
-chauvenet = function(df, col) {
- data = df[,col]
- data = data[!is.na(data)]
- mean = mean(data)
- std = sd(data)
- n = nrow(df)
- criterion = 1.0/(2*n)
- deviation = abs(data - mean)/std
-
- df[!is.na(df[,col]),paste(col,"_chauvenetOutlier",sep="")] = ((1-pnorm(deviation))windowSize]
- k = 1
- for (i in instances2Analyze) {
- if(k%%100 ==0) print(paste("Instance: ",i))
- k = k +1
- for (col in cols) {
- dft = fft(df[(i-windowSize):(i-1),col])
- for (j in 1:length(freqs)) {
- df[i,paste(col,"_freq_",freqs[j],"_Hz_ws_",windowSize, sep="")] = Re(dft[j])
- }
- df[i,paste(col,"_max_freq",sep="")] = freqs[which.max(Re(dft))]
- df[i,paste(col,"_freq_weighted",sep="")] = sum(freqs * Re(dft)) / sum(Re(dft))
- pse = Re(dft)^2/windowSize
- pse = pse/sum(pse)
- df[i,paste(col,"_pse",sep="")] = -sum(log(pse)*pse)
- }
- }
- return(df)
-}
-
-# Remove periodic sinusoid functions from our data to be left with the "clean" signal
-# The components should be specified by means of their index (meaning their period).
-# removeComponents = function(df, col, components = 0) {
-# dft = fft(df[,col])
-# dft[components] = 0
-# df[,col] = Re(fft(dft, inverse = TRUE)/length(dft))
-# return(df)
-# }
diff --git a/RCode/Chapter4/temporalAbstraction.R b/RCode/Chapter4/temporalAbstraction.R
deleted file mode 100644
index 30cdeca3..00000000
--- a/RCode/Chapter4/temporalAbstraction.R
+++ /dev/null
@@ -1,130 +0,0 @@
-##########################################################################
-#
-# Mark Hoogendoorn & Burkhardt Funk (2017)
-# Machine Learning for the Quantified Self
-# Springer
-# Chapter 4 - Feature Engineering based on Sensory Data
-# ./chapter4/temporalAbstraction.R
-#
-##########################################################################
-
-# This function aggregates a list of values using the specified aggregation
-# function (which can be 'mean', 'max', 'min', 'median', 'std', 'slope')
-aggregateValue = function(data, aggregationFunction) {
- if (aggregationFunction == "mean") return(mean(data))
- if (aggregationFunction == "max") return(max(data))
- if (aggregationFunction == "min") return(min(data))
- if (aggregationFunction == "median") return(median(data))
- if (aggregationFunction == "std") return(sd(data))
- if (aggregationFunction == "slope") return(as.numeric(lm(data~(x=1:length(data)))$coef[2]))
- stop(paste("aggregationFunction:",aggregationFunction,"not found"))
-}
-
-
-abstractNumerical = function(df, cols, windowSize, aggregationFunction) {
- for (col in cols) {
- newColName = paste(col,"_temp_",aggregationFunction,"_ws_",windowSize,sep = "")
- data = df[,col]
- aggData = sapply((windowSize+1):length(data), function(x) {aggregateValue(data[(x-windowSize):x],aggregationFunction)})
- df[floor(windowSize/2)+(1:(length(data)-windowSize)),newColName] = aggData
- }
- return(df)
-}
-
-# Function to abstract categorical data. Note that we assume a list of binary columns representing
-# the different categories
-abstractCategorical = function(df, cols, match, minSupport, windowSize, maxPatternSize) {
- # find all the relevant columns of binary attributes.
- colNames = names(df)
- selectedPatterns = c()
- relevantCols = c()
-
- for (i in 1:length(cols)) {
- if(match[i]=="exact") relevantCols = c(relevantCols,cols[i]) else
- relevantCols = c(relevantCols, colNames[grepl(cols[i],colNames)])
- }
-
- # generate the one patterns first
- potential1Patterns = as.list(relevantCols)
- temp = selectKPatterns(df,potential1Patterns,minSupport,windowSize)
- df = temp[[1]]
- onePatterns = temp[[2]]
- selectedPatterns[[length(selectedPatterns)+1]] = onePatterns
- print(paste("Number of patterns of size 1:",length(onePatterns)))
-
- k = 1
- kPatterns = onePatterns
- while(k< maxPatternSize & length(kPatterns)>0) {
- k = k + 1
- potentialKPatterns = extendKPatterns(kPatterns,onePatterns)
- print(potentialKPatterns)
- temp = selectKPatterns(df, potentialKPatterns, minSupport, windowSize)
- df = temp[[1]]
- kPatterns = temp[[2]]
- selectedPatterns[[length(selectedPatterns)+1]] = kPatterns
- print(paste("Number of patterns of size",k,":",length(kPatterns)))
- }
- return(df)
-}
-
-# selects the patterns from 'patterns' that meet the minimum support in the dataset
-# given the window size.
-selectKPatterns = function(df, patterns, minSupport, windowSize){
- sP = list()
- for (pattern in patterns) {
- # determine the number of occurrences of a pattern
- times = determinePatternTimes(df, pattern, windowSize)
- support = length(times)/nrow(df)
-
- # If we meet the minum support, append the selected patterns and set the
- # value to 1 at which it occurs.
- if (support>=minSupport) {
- sP[[length(sP)+1]] = pattern
- df[,paste(c("temp_pattern",pattern),collapse = "_")] = 0
- df[times,paste(c("temp_pattern",pattern),collapse = "_")] = 1
- print(pattern)
- }
- }
- return(list(df,sP))
-}
-
-determinePatternTimes = function(df, pattern, windowSize){
- times = c()
- if (length(pattern) == 1) {
- times = which(df[pattern]>0)
- } else if(length(pattern) == 3) {
- # If we have a complex pattern ( (b) or (c) )
- # due to naming conventions, brackets should not be used
- # we therefore use ".b." instead of "(b)"
- timesFirstPart = determinePatternTimes(df, pattern[1], windowSize)
- timesSecondPart = determinePatternTimes(df, pattern[3], windowSize)
- if (pattern[2] == ".c.") {
- if (pattern[1]==pattern[3]) {
- # No use for co-occurences of the same patterns
- times = c()
- } else {
- times = intersect(timesFirstPart,timesSecondPart)
- }
- } else if (pattern[2] == ".b.") {
- times = unlist(sapply(timesFirstPart, function(t){
- if(sum(t0) return(t)
- }))
-
- } else stop("complex operator not known")
-
- } else stop("pattern length error")
-
- return(times)
-}
-
-# extends a set of k-patterns with the 1-patterns that have sufficient support.
-extendKPatterns = function(kPatterns, onePatterns) {
- newPatterns = list()
- for(kP in kPatterns) {
- for(oneP in onePatterns) {
- newPatterns[[length(newPatterns)+1]] = c(kP,".b.",oneP)
- newPatterns[[length(newPatterns)+1]] = c(kP,".c.",oneP)
- }
- }
- return(newPatterns)
-}
diff --git a/RCode/Chapter4/textAbstraction.R b/RCode/Chapter4/textAbstraction.R
deleted file mode 100644
index 7c5c4fc8..00000000
--- a/RCode/Chapter4/textAbstraction.R
+++ /dev/null
@@ -1,10 +0,0 @@
-generateCorpus = function(df, col , n = 1) {
- dfs = VectorSource(as.character(df[,col]))
- vc = VCorpus(dfs)
- vc = vc %>% tm_map(content_transformer(tolower)) %>%
- tm_map(removeWords, stopwords("english")) %>%
- tm_map(stemDocument) %>%
- tm_map(removePunctuation) %>%
- tm_map(stripWhitespace)
- return(vc)
-}
\ No newline at end of file
diff --git a/RCode/RCode.Rproj b/RCode/RCode.Rproj
deleted file mode 100644
index 3af27f6a..00000000
--- a/RCode/RCode.Rproj
+++ /dev/null
@@ -1,13 +0,0 @@
-Version: 1.0
-
-RestoreWorkspace: Default
-SaveWorkspace: Default
-AlwaysSaveHistory: Default
-
-EnableCodeIndexing: Yes
-UseSpacesForTab: Yes
-NumSpacesForTab: 2
-Encoding: UTF-8
-
-RnwWeave: Sweave
-LaTeX: pdfLaTeX
diff --git a/RCode/Support/.Rhistory b/RCode/Support/.Rhistory
deleted file mode 100644
index e69de29b..00000000
diff --git a/RCode/Support/labBook.R b/RCode/Support/labBook.R
deleted file mode 100644
index 4863c103..00000000
--- a/RCode/Support/labBook.R
+++ /dev/null
@@ -1,93 +0,0 @@
-##########################################################################
-## Hoogendoorn & Funk (2017) ML4QS Springer
-## Lab Book
-## Version 17/02/10
-##########################################################################
-
-source("./Support/util.R")
-
-## quick aggregation of time series
-require(data.table)
-require(bit64)
-require(zoo)
-datasetPath = '../datasets/crowdsignals.io/csv-participant-one/'
-filename = 'accelerometer_phone.csv'
-#filename = 'gyroscope_phone.csv'
-#filename = 'heart_rate_smartwatch.csv'
-
-ds = loadDataset(datasetPath,filename)
-tempTS = ds$timestamps[1]
-ds[,timestamps:=as.POSIXct(ds[,timestamps], origin="1970-01-01")]
-ts1 = zoo(ds[,c("x","y","z"),with=FALSE],order.by = ds[,timestamps])
-
-aggWindow = 0.05 # aggregates on X sec intervals
-ts2 = aggregate(ts1, time(ts1) - as.numeric(time(ts1)) %% aggWindow, mean)
-time(ts2[1:100,])-time(ts2[2:101,])
-plot(ts2[40000:42000,"x"],xlab="time",ylab="acc_x")
-plot(ts2[40000:40200,"x"],xlab="time",ylab="acc_x")
-
-#ts3 = ts(stats::filter(ts2$x,sides = 2, rep(1/10,10)),frequency=1/aggWindow)
-#ts3 = zoo(ts3,order.by = as.POSIXct(tempTS+(0:(length(ts3)-1))*aggWindow,origin="1970-01-01"))
-
-# https://www.analyticsvidhya.com/blog/2015/12/complete-tutorial-time-series-modeling/
-limTS = c(41000:42000)
-acf((ts2[limTS,"x"]),lag.max = 100)
-pacf((ts2[limTS,"x"]),lag.max = 100)
-(fit <- arima((ts2[limTS,"x"]),order = c(0,1,1), seasonal = list(order = c(0, 1, 0),period=22)))
-pred= predict(fit,n.ahead = 100)
-plot(1:100,as.numeric(ts2[41901:42000,"x"]), xlim=c(0,200), ylim=c(-15,15), ylab="x", xlab="time",type="l")
-lines(101:200,pred$pred,lty=3)
-lines(101:200,pred$pred+pred$se,lty=1,col="grey")
-lines(101:200,pred$pred-pred$se,lty=1,col="grey")
-grid()
-
-
-# Basic ts simulation and fitting
-temp.ts= arima.sim(n = 1000, list(order = c(1,0,1), ar = c(-.9),ma=0.3))
-acf(temp.ts,lag.max = 500)
-pacf(temp.ts,lag.max = 50)
-fit=arima(temp.ts,order = c(1,0,1))
-plot(forecast(fit,h = 10),include = 20)
-
-
-# Applying to crowdsignal raw data
-temp.ts = ds$x[400000:402000]
-acf(temp.ts,lag.max = 1000)
-pacf(temp.ts,lag.max = 200)
-# sarima.for(temp.ts,600,1,0,0,1,0,0,221)
-
-fit = auto.arima(temp.ts,max.p = 10,max.q = 10,seasonal = TRUE,stepwise = FALSE)
-summary(fit)
-
-plot.ts(temp.ts)
-lines(0:2000,fit$fitted,col="red")
-
-plot(forecast(fit,h = 40),include =400)
-lines(2001:2400,ds$x[402001:402400],col="red")
-lines(2001:2400,ds$x[401779:402178],col="green",lwd=2)
-
-# Applying to crowdsignal aggregated data
-load("50msAggregatedAccData.RData")
-temp.ts = dataset1$acc_phone_x[40001:40200]
-acf(temp.ts,lag.max = 100)
-pacf(temp.ts,lag.max = 50)
-
-aicMin =1e20
-for(p in 0:3) {
- for(q in 0:3) {
- fit = arima(temp.ts,order = c(p,0,q),seasonal = list(order = c(1,0,0),period=22))
- if(fit$aic 0,labels[df$class],NA)
- return(df)
-}
-
-# Split a dataset of a single person for a classificaiton problem with the specified class columns classLabels.
-# We can have multiple targets if we want. It assumes a list in 'classLabels'
-# If 'like' is specified in matching, we will merge the columns that contain the classLabels into a single
-# columns. We can select a filter for rows where we are unable to identifty a unique
-# class and we can select whether we have a temporal dataset or not. In the former, we will select the first
-# trainingFrac of the data for training and the last 1-trainingFrac for testing. Otherwise, we select points randomly.
-# We return a training set, the labels of the training set, and the same for a test set. We can set the random seed
-# to make the split reproducible.
-splitSingleDatasetClassification = function(df, classLabels, matching, trainingFrac, filter=TRUE, temporal = FALSE, randomState = 0) {
-
- # features are the ones not in the class label (include timestamp, or use & names(df)!="time")
- features = names(df)[!startsWith(names(df),classLabels)]
-
- # create a single class column if we have the 'like' option.
- if (matching == "like") {
- df = assignLabel(df, classLabels)
- classLabels = "class"
- }
-
- # filter NA is desired and those for which we cannot determine the class should be removed.
- if(filter) df = subset(df,!class==0) #[!is.na(df$class),]
-
- # For temporal data, we select the desired fraction of training data from the first part
- # and use the rest as test set.
- if (temporal) {
- endTrainingSet = int(trainingFrac * nrow(df))
- trainingSetX = df[1:endTrainingSet,features]
- trainingSety = df[1:endTrainingSet,classLabels]
- testSetX = df[(endTrainingSet+1):nrow(df),features]
- testSety = df[(endTrainingSet+1):nrow(df),classLabels]
- } else {
- # For non temporal data we use a standard function to randomly split the dataset.
- trainTestSplit = sample(nrow(df),trainingFrac*nrow(df))
- trainingSetX = df[trainTestSplit,features]
- trainingSety = df[trainTestSplit,classLabels]
- testSetX = df[-trainTestSplit,features]
- testSety = df[-trainTestSplit,classLabels]
- }
- return(list(trainingSetX = trainingSetX, trainingSety = trainingSety,
- testSetX = testSetX, testSety = testSety))
-}
\ No newline at end of file
diff --git a/RCode/crowdsignals_ch2.R b/RCode/crowdsignals_ch2.R
deleted file mode 100644
index 9d1ec99b..00000000
--- a/RCode/crowdsignals_ch2.R
+++ /dev/null
@@ -1,79 +0,0 @@
-##########################################################################
-#
-# Mark Hoogendoorn & Burkhardt Funk (2017)
-# Machine Learning for the Quantified Self
-# Springer
-# Chapter 2 - Sensory Data
-# ./crowdsignals_ch2.R
-#
-##########################################################################
-
-# Import libraries and source utils
-rm(list=ls())
-require(data.table)
-require(bit64)
-require(plotly)
-require(reshape2)
-source("./chapter2/createDataset.R")
-source("./chapter2/visiualizeData.R")
-
-# Set a granularity (i.e. how big are our discrete time steps). We start very
-# coarse grained, namely one measurement per minute, and secondly use four measurements
-# per second
-datasetPath = "../datasets/crowdsignals.io/csv-participant-one/"
-#granularities = c(60000,250)
-granularities = c(250)
-
-boxPlot = NULL
-linePlot = NULL
-for (timeWindow in granularities) {
-
- # We add the accelerometer data (continuous numerical measurements) of the phone and the smartwatch
- # and aggregate the values per timestep by averaging the values/
- crowdSignalData = addNumericalDataset(NULL, "accelerometer_phone.csv","timestamps", c("x","y","z"), "avg", "acc_phone_", datasetPath,timeWindow)
- crowdSignalData = addNumericalDataset(crowdSignalData, "accelerometer_smartwatch.csv","timestamps", c("x","y","z"), "avg", "acc_watch_", datasetPath,timeWindow)
-
- # We add the gyroscope data (continuous numerical measurements) of the phone and the smartwatch
- # and aggregate the values per timestep by averaging the values
- crowdSignalData = addNumericalDataset(crowdSignalData, "gyroscope_phone.csv","timestamps", c("x","y","z"), "avg", "gyr_phone_", datasetPath,timeWindow)
- crowdSignalData = addNumericalDataset(crowdSignalData, "gyroscope_smartwatch.csv","timestamps", c("x","y","z"), "avg", "gyr_watch_", datasetPath,timeWindow)
-
- # We add the heart rate (continuous numerical measurements) and aggregate by averaging again
- crowdSignalData = addNumericalDataset(crowdSignalData, "heart_rate_smartwatch.csv","timestamps", c("rate"), "avg", "hr_watch_", datasetPath,timeWindow)
-
- # We add the labels provided by the users. These are categorical events that might overlap. We add them
- # as binary attributes (i.e. ad a one to the attribute representing the specific value for the label if it
- # occurs within an interval).
- crowdSignalData = addEventDataset(crowdSignalData, "labels.csv", "label_start", "label_end", "label", "binary", datasetPath)
-
- # We add the amount of light sensed by the phone (continuous numerical measurements) and aggregate by averaging again
- crowdSignalData = addNumericalDataset(crowdSignalData, "light_phone.csv","timestamps", c("lux"), "avg", "light_phone_", datasetPath,timeWindow)
-
- # We add the magnetometer data (continuous numerical measurements) of the phone and the smartwatch
- # and aggregate the values per timestep by averaging the values/
- crowdSignalData = addNumericalDataset(crowdSignalData, "magnetometer_phone.csv","timestamps", c("x","y","z"), "avg", "mag_phone_", datasetPath,timeWindow)
- crowdSignalData = addNumericalDataset(crowdSignalData, "magnetometer_smartwatch.csv","timestamps", c("x","y","z"), "avg", "mag_watch_", datasetPath,timeWindow)
-
- # We add the pressure sensed by the phone (continuous numerical measurements) and aggregate by averaging again
- crowdSignalData = addNumericalDataset(crowdSignalData, "pressure_phone.csv","timestamps", c("pressure"), "avg", "press_phone_", datasetPath,timeWindow)
- crowdSignalData$time = as.POSIXct(crowdSignalData$time/1000, origin="1970-01-01")
-
- # Lineplot
- # to plot, simply use print, e.g. "print(p[1])"
- temp = plotDataset(crowdSignalData, c("acc_", "gyr_", "hr_watch_rate", "light_phone_lux", "mag_", "press_phone_","label"),NULL)
- if (is.null(linePlot)) linePlot = subplot(temp,nrows = length(temp),shareX = TRUE) else
- linePlot = list(linePlot,subplot(temp,nrows = length(temp),shareX = TRUE))
-
- # Boxplot
- if(is.null(boxPlot)) boxPlot = list(plotBoxplot(crowdSignalData, c("acc_phone_x","acc_phone_y","acc_phone_z","acc_watch_x","acc_watch_y","acc_watch_z"))) else
- boxPlot = list(boxPlot,plotBoxplot(crowdSignalData, c("acc_phone_x","acc_phone_y","acc_phone_z","acc_watch_x","acc_watch_y","acc_watch_z")))
-}
-
-# Only the dataset with the last value for granularities is saved
-resultPath = "./intermediate_datafiles/"
-save(crowdSignalData, file=paste(resultPath,"chapter2_result.RData",sep=""))
-
-# Save as CSV file
-
-# crowdSignalData$time = as.character(crowdSignalData$time,format="%Y-%m-%d %H:%M:%OS4")
-# write.csv(x=crowdSignalData, paste(resultPath,"chapter2_result.csv",sep=""))
\ No newline at end of file
diff --git a/RCode/crowdsignals_ch3_outliers.R b/RCode/crowdsignals_ch3_outliers.R
deleted file mode 100644
index 412d5db6..00000000
--- a/RCode/crowdsignals_ch3_outliers.R
+++ /dev/null
@@ -1,86 +0,0 @@
-##########################################################################
-#
-# Mark Hoogendoorn & Burkhardt Funk (2017)
-# Machine Learning for the Quantified Self
-# Springer
-# Chapter 3 - Handling Noise and Missing Values in Sensory Data
-# ./crowdsignals_ch3_outliers.R
-#
-##########################################################################
-
-# Import libraries and source utils
-rm(list=ls())
-require(plotly)
-require(reshape2)
-require(zoo) # used for time series representation
-require(KFAS) # Kalman filtering and smoothing
-require(Rlof) # package for local outlier factor
-source("./chapter2/visiualizeData.R")
-source("./chapter3/distributionBasedOutlierDetection.R")
-source("./chapter3/distanceBasedOutlierDetection.R")
-
-# Read the result from the previous chapter, and make sture the index is of the type datetime
-resultPath = './intermediate_datafiles/'
-load(file=paste(resultPath,"chapter2_result.RData",sep=""))
-
-# Compute the number of milliseconds covered by an instane based on the first two rows
-timeWindow = as.numeric(difftime(crowdSignalData$time[2],crowdSignalData$time[1],units = "secs"))*1000
-
-# Determine the columns we want to experiment on.
-col2Inspect = c('acc_phone_x', 'light_phone_lux')
-
-# Investigate the approaches for all attributes specified in col2Inspect
-for (col in col2Inspect) {
- # Note that we have done some optimization of the parameter values for each of the
- # approaches by visual inspection
- print(paste("====",col,"===="))
-
- # Chauvenet criterion
- print(paste("Chauvenet:",Sys.time()))
- crowdSignalData = chauvenet(crowdSignalData, col)
- plotBinaryOutliers(crowdSignalData,col,paste(col,'_chauvenetOutlier',sep=""))
-
- # Mixture models
- print(paste("Mixture model:",Sys.time()))
- crowdSignalData = mixtureModel(crowdSignalData,col,numOfComponents = 3, plot = FALSE)
- crowdSignalData[,paste(col,"_mixtureOutlier",sep="")] = crowdSignalData[,paste(col,"_mixtureProb",sep="")]<0.0001
- plotBinaryOutliers(crowdSignalData,col,paste(col,"_mixtureOutlier",sep=""))
-
- # Distance based approaches require to calculate pair-wise distances, the computation time and
- # the memory requirements increase with N^2, for demonstration purposes we therefore reduce the sample size
- crowdSignalData = crowdSignalData[1:10000,]
-
- # Simple distance based detection
- print(paste("Simple distance based:",Sys.time()))
- crowdSignalData = simpleDistanceBased(crowdSignalData,col, 0.5, 0.99)
- plotBinaryOutliers(crowdSignalData,col,paste(col,"_simpleDistOutlier",sep=""))
-
- # Local Outlier Factor: we use the Rlof package which implements LOF
- print(paste("Local Outlier Factor:",Sys.time()))
- crowdSignalData = localOutlierFactor(crowdSignalData, col, dFunction = "euclidean", 5)
- crowdSignalData[, paste(col,"_lofOutlier",sep="")] = crowdSignalData$lof > 5
- plotBinaryOutliers(crowdSignalData, col, paste(col,"_lofOutlier",sep=""))
-}
-
-# Reload the result from Chapter 2 and apply Chauvenet criterion to all attributes but the labels
-load(file=paste(resultPath,"chapter2_result.RData",sep=""))
-col2Inspect = names(crowdSignalData)
-col2Inspect = subset(col2Inspect, (!grepl("label",col2Inspect)&col2Inspect!="time"))
-for (col in col2Inspect) {
-
- # Chauvenet criterion
- crowdSignalData = chauvenet(crowdSignalData, col)
- plotBinaryOutliers(crowdSignalData,col,paste(col,'_chauvenetOutlier',sep=""))
-
- # Set outliers to NA
- temp = crowdSignalData[,paste(col,'_chauvenetOutlier',sep="")]
- temp = ifelse(is.na(temp),FALSE,temp)
- crowdSignalData[temp,col] = NA
-}
-
-# Delete outlier columns
-crowdSignalData = crowdSignalData[,!grepl("_chauvenetOutlier",names(crowdSignalData))]
-
-# Save to file
-resultPath = './intermediate_datafiles/'
-save(crowdSignalData, file=paste(resultPath,"chapter3_result_outliers.RData",sep=""))
\ No newline at end of file
diff --git a/RCode/crowdsignals_ch3_rest.R b/RCode/crowdsignals_ch3_rest.R
deleted file mode 100644
index 772d19b5..00000000
--- a/RCode/crowdsignals_ch3_rest.R
+++ /dev/null
@@ -1,96 +0,0 @@
-##########################################################################
-#
-# Mark Hoogendoorn & Burkhardt Funk (2017)
-# Machine Learning for the Quantified Self
-# Springer
-# Chapter 3 - Handling Noise and Missing Values in Sensory Data
-# ./crowdsignals_ch3_rest.R
-#
-##########################################################################
-
-# Import libraries and source utils
-rm(list=ls())
-require(plotly)
-require(reshape2)
-require(zoo) # used for time series representation
-require(KFAS) # Kalman filtering and smoothing
-require(Rlof) # package for local outlier factor
-require(signal) # signal processing package (for Butterworth filter)
-source("./chapter2/visiualizeData.R")
-source("./chapter3/imputationMissingValues.R")
-source("./chapter3/kalmanFilter.R")
-source("./chapter3/lowPassFilter.R")
-source("./chapter3/principalComponent.R")
-
-# Read the result from the previous chapter, and make sture the index is of the type datetime
-resultPath = "./intermediate_datafiles/"
-load(file=paste(resultPath,"chapter2_result.RData",sep=""))
-chapter2Dataset = crowdSignalData
-
-load(file=paste(resultPath,"chapter3_result_outliers.RData",sep=""))
-
-# Compute the number of milliseconds covered by an instane based on the first two rows
-timeWindow = as.numeric(difftime(crowdSignalData$time[2],crowdSignalData$time[1],units = "secs"))*1000
-
-### Missing Values
-imputedMeanDataset = imputeMean(crowdSignalData, "hr_watch_rate")
-imputedMedianDataset = imputeMedian(crowdSignalData, "hr_watch_rate")
-imputedInterpolationDataset = imputeInterpolation(crowdSignalData, "hr_watch_rate")
-plotImputedValues(crowdSignalData, "hr_watch_rate", list(imputedMeanDataset["hr_watch_rate"],
- imputedMedianDataset["hr_watch_rate"], imputedInterpolationDataset["hr_watch_rate"]))
-rm(imputedMeanDataset,imputedMedianDataset, imputedInterpolationDataset)
-
-# Impute for all columns except for the label in the selected way (interpolation)
-col2Inspect = names(crowdSignalData)
-col2Inspect = subset(col2Inspect, (!grepl("label",col2Inspect)&col2Inspect!="time"))
-for (col in col2Inspect)
- crowdSignalData = imputeInterpolation(crowdSignalData, col)
-
-### Kalman filtering on the acc_phone_x attribute
-kalmanDataset = applyKalmanFilter(chapter2Dataset , "acc_phone_x")
-plot(kalmanDataset$time, kalmanDataset$acc_phone_x_kalman - kalmanDataset$acc_phone_x,type="l", col="blue", ylab = "Difference after Kalman Filter", xlab = "time")
-grid()
-
-rm(kalmanDataset) # do not use Kalman filtering
-
-### Low pass filtering: "reduce" signals from the data with more than 1.5 Hz
-
-fs = 1000/timeWindow # sampling frequency [Hz]
-cutoff = 1.5 # cut off [Hz]
-
-newDataset = lowPassFilter(crowdSignalData,"acc_phone_x",fs, cutoff, nOrder = 1)
-start = round(0.4*nrow(newDataset))
-end = round(0.43*nrow(newDataset))
-temp = plotDataset(newDataset[start:end,], c("acc_phone_x","acc_phone_x_lowpass"),c("exact","exact"))
-subplot(temp,nrows = length(temp),shareX = TRUE)
-
-rm(newDataset)
-
-# Apply lowpass filter to periodic measurements
-periodicMeasurements = c("acc_phone_x", "acc_phone_y", "acc_phone_z", "acc_watch_x", "acc_watch_y", "acc_watch_z", "gyr_phone_x", "gyr_phone_y",
- "gyr_phone_z", "gyr_watch_x", "gyr_watch_y", "gyr_watch_z", "mag_phone_x", "mag_phone_y", "mag_phone_z", "mag_watch_x",
- "mag_watch_y", "mag_watch_z")
-
-for (col in periodicMeasurements) {
- crowdSignalData = lowPassFilter(crowdSignalData, col, fs, cutoff, nOrder = 1)
- crowdSignalData[,col] = crowdSignalData[,paste(col,"_lowpass",sep="")]
-
-}
-crowdSignalData = crowdSignalData[,!grepl("_lowpass",colnames(crowdSignalData))]
-
-### Principal Component Analysis (PCA)
-col2Inspect = names(crowdSignalData)
-col2Inspect = subset(col2Inspect, (!grepl("label",col2Inspect) & col2Inspect!="time" &
- col2Inspect!="hr_watch_rate"))
-plot(pcaExplainedVariance(crowdSignalData,col2Inspect),type="b",col="blue",
- xlab = "Number of PC", ylab = "Explained variance")
-grid()
-
-crowdSignalData = applyPCA(crowdSignalData, col2Inspect, 7)
-temp = plotDataset(crowdSignalData, c("acc_", "gyr_", "hr_watch_rate", "light_phone_lux", "mag_", "press_phone_", "pca_", "label"),NULL)
-if (is.null(linePlot)) linePlot = subplot(temp,nrows = length(temp),shareX = TRUE) else
- linePlot = list(linePlot,subplot(temp,nrows = length(temp),shareX = TRUE))
-
-# Write output
-resultPath = "./intermediate_datafiles/"
-save(crowdSignalData, file=paste(resultPath,"chapter3_result_final.RData",sep=""))
diff --git a/RCode/crowdsignals_ch4.R b/RCode/crowdsignals_ch4.R
deleted file mode 100644
index 465c6cc9..00000000
--- a/RCode/crowdsignals_ch4.R
+++ /dev/null
@@ -1,80 +0,0 @@
-##########################################################################
-#
-# Mark Hoogendoorn & Burkhardt Funk (2017)
-# Machine Learning for the Quantified Self
-# Springer
-# Chapter 4 - Feature Engineering based on Sensory Data
-# ./crowdsignals_ch4.R
-#
-##########################################################################
-
-rm(list=ls())
-require(data.table)
-require(bit64)
-require(plotly)
-require(reshape2)
-require(zoo) # used for time series representation
-source("./Chapter2/createDataset.R")
-source("./Chapter2/visiualizeData.R")
-source("./chapter4/temporalAbstraction.R")
-source("./chapter4/frequencyAbstraction.R")
-
-
-### settings and data import
-resultPath = "./intermediate_datafiles/"
-load(file=paste(resultPath,"chapter3_result_final.RData",sep=""))
-
-# compute the number of milliseconds covered by an instane based on the first two rows
-timeWindow = as.numeric(difftime(crowdSignalData$time[2],crowdSignalData$time[1],units = "secs"))*1000
-
-# set different window sizes to the number of instances
-windowSizes = c(as.integer(5000/timeWindow), as.integer(30000/timeWindow), as.integer(300000/timeWindow))
-
-dataSetCopy = crowdSignalData
-for (ws in windowSizes) {
- dataSetCopy = abstractNumerical(dataSetCopy,"acc_phone_x",windowSize = ws,aggregationFunction = "mean")
- dataSetCopy = abstractNumerical(dataSetCopy,"acc_phone_x",windowSize = ws,aggregationFunction = "std")
-}
-
-temp = plotDataset(dataSetCopy, c("acc_phone_x", "acc_phone_x_temp_mean", "acc_phone_x_temp_std", "label"),c("exact", "like", "like", "like"))
-linePlot = subplot(temp,nrows = length(temp),shareX = TRUE)
-print(linePlot)
-
-# add new columns to our dataset
-ws = as.integer(30000/timeWindow)
-
-selectedPredictorCols = names(crowdSignalData)
-selectedPredictorCols = selectedPredictorCols[!grepl("label",selectedPredictorCols)&!grepl("time",selectedPredictorCols)]
-
-crowdSignalData = abstractNumerical(crowdSignalData,selectedPredictorCols,windowSize = ws,aggregationFunction = "mean")
-crowdSignalData = abstractNumerical(crowdSignalData,selectedPredictorCols,windowSize = ws,aggregationFunction = "std")
-
-
-# select frequent patterns
-# crowdSignalData = abstractCategorical(crowdSignalData, c("label"), c("like"), 0.03, as.integer(300000/timeWindow), 2)
-
-### features from the frequency domain
-fs = 1000/timeWindow
-windowSize = 10000/timeWindow
-
-selectedPredictorCols = c("acc_phone_x","acc_phone_y","acc_phone_z","acc_watch_x","acc_watch_y","acc_watch_z","gyr_phone_x","gyr_phone_y",
- "gyr_phone_z","gyr_watch_x","gyr_watch_y","gyr_watch_z","mag_phone_x","mag_phone_y","mag_phone_z",
- "mag_watch_x","mag_watch_y","mag_watch_z")
-
-# accept only a certain percentage of overlap in the windows, otherwise the training examples
-# will be too much alike.
-windowOverlap = 0.9
-skipPoints = max(1,as.integer((1-windowOverlap) * ws))
-instances2Analyze = seq(1,nrow(crowdSignalData),skipPoints)
-
-crowdSignalData = abstractFrequency(crowdSignalData, selectedPredictorCols, windowSize, fs, instances2Analyze)
-crowdSignalData = crowdSignalData[instances2Analyze,]
-
-# temp = plotDataset(dataSetCopy, c("acc_phone_x_max_freq", "acc_phone_x_freq_weighted", "acc_phone_x_pse", "labelOnTable"),c("like", "like", "like", "exact"))
-# linePlot = subplot(temp,nrows = length(temp),shareX = TRUE)
-# print(linePlot)
-
-
-# write output
-resultPath = "./intermediate_datafiles/"
-save(crowdSignalData, file=paste(resultPath,"chapter4_result.RData",sep=""))
diff --git a/RCode/crowdsignals_ch5.R b/RCode/crowdsignals_ch5.R
deleted file mode 100644
index 31cea0a6..00000000
--- a/RCode/crowdsignals_ch5.R
+++ /dev/null
@@ -1,99 +0,0 @@
-##########################################################################
-#
-# Mark Hoogendoorn & Burkhardt Funk (2017)
-# Machine Learning for the Quantified Self
-# Springer
-# Chapter 5 - Clustering
-# ./crowdsignals_ch5.R
-#
-##########################################################################
-
-rm(list=ls())
-require(cluster)
-require(scatterplot3d)
-require(ggplot2)
-
-### settings and data import
-resultPath = "./intermediate_datafiles/"
-load(file=paste(resultPath,"chapter4_result.RData",sep=""))
-
-# compute the number of milliseconds covered by an instane based on the first two rows
-timeWindow = as.numeric(difftime(crowdSignalData$time[2],crowdSignalData$time[1],units = "secs"))*1000
-
-### k-means clustering
-
-# check silhouette scores for k-means with different k
-kValues = 2:9
-silhouetteValues = c()
-
-tmpDat = crowdSignalData[,c('acc_phone_x', 'acc_phone_y', 'acc_phone_z')]
-for (k in kValues) {
- cat("k=", k,"\n")
- resKM = kmeans(tmpDat, centers = k,
- iter.max = 50, nstart = 50)
- dai = daisy(tmpDat)
- sil = silhouette(resKM$cluster,dai)
- silhouetteValues[k-kValues[1]+1] = summary(sil)$avg.width
-}
-plot(kValues,silhouetteValues,xlab="k",ylab="Silhouette Score (K-Means)",ylim = c(0,1),type="l",col="blue")
-grid()
-
-# run k-means with the highest silhouette score
-k = 6
-resKM = kmeans(tmpDat, centers = k,
- iter.max = 50, nstart = 50)
-sil = silhouette(resKM$cluster,dai)
-factoextra::fviz_silhouette(sil)
-
-# plot labels and clusters in 3D
-tmp =crowdSignalData[,colnames(crowdSignalData)[grepl("label",substr(colnames(crowdSignalData),1,10))]]
-tmpDat$pch = apply(tmp,1,max,na.rm=TRUE)
-with(tmpDat, {
- scatterplot3d(acc_phone_x, acc_phone_y, acc_phone_z,
- color= rainbow(k)[resKM$cluster], # color refers to cluster
- pch=pch, # marker refers to task label
- main="")
-})
-
-# clustering using k-medoids
-tmpDat = crowdSignalData[,c('acc_phone_x', 'acc_phone_y', 'acc_phone_z')]
-kValues = 2:9
-silhouetteValues = c()
-
-for (k in kValues) {
- cat("k=", k,"\n")
- resKMed = pam(tmpDat, k)
- dai = daisy(tmpDat)
- sil = silhouette(resKMed$cluster,dai)
- silhouetteValues[k-kValues[1]+1] = summary(sil)$avg.width
-}
-plot(kValues,silhouetteValues,xlab="k",ylab="Silhouette Score (K-Medoids)",ylim = c(0,1),type="l",col="blue")
-grid()
-
-# run k-medoids with the highest silhouette score
-k = 6
-resKMed = pam(tmpDat, k)
-sil = silhouette(resKMed$cluster,dai)
-factoextra::fviz_silhouette(sil)
-
-# demonstrate that the k-means and k-medoids lead to very similar clusters
-table(resKM$cluster,resKMed$clustering) # cluster numbers are different, but otherwise no big difference
-
-### hierarchical clustering (agglomerative)
-cluster = hclust(dist(tmpDat[,-4]),method = "ward.D")
-kMax = 6
-memb = cutree(cluster, k = kMax)
-sil = silhouette(memb,dai)
-factoextra::fviz_silhouette(sil)
-
-cent = NULL
-for(k in 1:kMax){
- cent <- rbind(cent, colMeans(tmpDat[,-4][memb == k, , drop = FALSE]))
-}
-hc1 <- hclust(dist(cent)^2, method = "average", members = table(memb))
-plot(hc1)
-
-# write output
-crowdSignalData$cluster = resKM$cluster
-resultPath = "./intermediate_datafiles/"
-save(crowdSignalData, file=paste(resultPath,"chapter5_result.RData",sep=""))
diff --git a/RCode/crowdsignals_ch7_classification.R b/RCode/crowdsignals_ch7_classification.R
deleted file mode 100644
index 217f2815..00000000
--- a/RCode/crowdsignals_ch7_classification.R
+++ /dev/null
@@ -1,103 +0,0 @@
-##########################################################################
-#
-# Mark Hoogendoorn & Burkhardt Funk (2017)
-# Machine Learning for the Quantified Self
-# Springer
-# Chapter 7 - Classification
-# ./crowdsignals_ch7_classification.R
-#
-##########################################################################
-
-rm(list=ls())
-require(mlr) # used for elaborated mechanisms of feature selection
-source("./chapter7/prepareDatasetForLearning.R")
-
-
-### settings and data import
-resultPath = "./intermediate_datafiles/"
-load(file=paste(resultPath,"chapter5_result.RData",sep=""))
-
-# We create a single column with the categorical attribute representing our class. Furthermore, we use 70% of our data
-# for training and the remaining 30% as an independent test set. We select the sets based on stratified sampling. We remove
-# cases where we do not know the label.
-res = splitSingleDatasetClassification(df=crowdSignalData,classLabels = "label", matching = "like", trainingFrac = 0.7)
-trainX = res$trainingSetX
-trainY = res$trainingSety
-testX = res$testSetX
-testY = res$testSety
-
-
-print(paste("Training set length is: ", nrow(trainX)))
-print(paste("Test set length is: ", nrow(testX)))
-
-# select subsets of the features that we will consider:
-basicFeatures = c("acc_phone_x","acc_phone_y","acc_phone_z","acc_watch_x","acc_watch_y","acc_watch_z","gyr_phone_x","gyr_phone_y","gyr_phone_z","gyr_watch_x","gyr_watch_y","gyr_watch_z",
- "hr_watch_rate", "light_phone_lux","mag_phone_x","mag_phone_y","mag_phone_z","mag_watch_x","mag_watch_y","mag_watch_z","press_phone_pressure")
-pcaFeatures = c("pca_1","pca_2","pca_3","pca_4","pca_5","pca_6","pca_7")
-timeFeatures = names(crowdSignalData)[grepl("_temp_",names(crowdSignalData))]
-freqFeatures = names(crowdSignalData)[grepl("_freq",names(crowdSignalData))]
-clusterFeatures = "cluster"
-
-print(paste("#basic features: ", length(basicFeatures)))
-print(paste("#PCA features: ", length(pcaFeatures)))
-print(paste("#time features: ", length(timeFeatures)))
-print(paste("#frequency features: ", length(freqFeatures)))
-print(paste("#cluster features: ", length(clusterFeatures)))
-
-featuresUptoChapter3 = c(basicFeatures,pcaFeatures)
-featuresUptoChapter4 = c(featuresUptoChapter3, timeFeatures, freqFeatures)
-featuresUptoChapter5 = c(featuresUptoChapter4, clusterFeatures)
-
-### determine relevant features by forward selection on a tree learner
-# we use the mlr/mbench packages in R to do forward selection elegantly
-# source: https://mlr-org.github.io/mlr-tutorial/release/html/feature_selection/index.html
-trainingSet = trainX[,featuresUptoChapter5]
-trainingSet$class = as.factor(trainY)
-
-# generate classification task
-classTask = makeClassifTask(data = trainingSet, target="class")
-
-# specify search strategy (forward selection)
-ctrl = makeFeatSelControlSequential(method = "sfs", alpha =0, max.features = 50)
-rdesc = makeResampleDesc("CV", iters = 2)
-selfeats = selectFeatures(learner = "classif.rpart", task = classTask, resampling = rdesc, control = ctrl,
- measure = acc, show.info = TRUE)
-print(paste("Selected features",selfeats$x))
-
-
-### explore regularization (use glmnet)
-trainingSet = trainX[,featuresUptoChapter4]#[,selectedFeatures]
-trainingSet$class = as.factor(trainY)
-testSet = testX[,featuresUptoChapter4]#[,selectedFeatures]
-testSet$class = as.factor(testY)
-
-classTask = makeClassifTask(data = trainingSet, target="class")
-
-#learner = makeLearner("classif.mlp", size = 250, maxit = 500)#, decay = 1, size = 50, maxit = 1000)
-
-regParameters = c( 0.0001, 0.001, 0.01, 0.1, 1, 10)
-performanceTraining = c()
-performanceTest = c()
-k = 0
-
-for (lambda in regParameters) {
- k = k + 1
- learner = makeLearner("classif.glmnet", s = lambda)
- model = train(learner, classTask)
- pred = predict(model, task = classTask)
- performanceTraining = c(performanceTraining,
- performance(pred, measures = list(acc)))
-
- pred = predict(model, makeClassifTask(data=testSet,target="class"))
- performanceTest = c(performanceTest,
- performance(pred, measures = list(acc)))
-}
-
-plot(regParameters,performanceTraining,log = "x",ylim=c(0,1.05),type="l",ylab="Accuracy")
-lines(regParameters,performanceTest,lty=2,col="red")
-grid()
-
-# selected features (see python code)
-selectedFeatures = c("acc_phone_y_freq_0_Hz_ws_40", "press_phone_pressure_temp_mean_ws_120", "gyr_phone_x_temp_std_ws_120",
- "mag_watch_y_pse", "mag_phone_z_max_freq", "gyr_watch_y_freq_weighted", "gyr_phone_y_freq_1_Hz_ws_40",
- "acc_phone_x_freq_1.9_Hz_ws_40", "mag_watch_z_freq_0.9_Hz_ws_40", "acc_watch_y_freq_0.5_Hz_ws_40")
diff --git a/README.md b/README.md
deleted file mode 100644
index c57c0bb3..00000000
--- a/README.md
+++ /dev/null
@@ -1,8 +0,0 @@
-# ML4QS
-
-This repository provides all the code associated with the book titled "Machine Learning for the Quantified Self", authored by Mark Hoogendoorn and Burkhardt Funk and published by Springer in 2018. The website of the book can be found on ml4qs.org. Both R code, Python 2 code (used to generate the results in the book) and Python3 code can be found (due to updated packages, results might differ a bit compared to those reported in the book). For the Python3 code, both a Docker setup and a requirements file are available, see the README.md in the Python3 directory.
-
-Note that we have tried to make the code as robust as we can, but we cannot provide any guarantees on its correctness. This code is made available under the GNU public license. We have used snippets of code from other sources and have tried to add references to these in our code where possible. When using the code for publications, please include a reference to the book in your paper:
-
-Hoogendoorn, M. and Funk, B., Machine Learning for the Quantified Self - On the Art of Learning from Sensory Data, Springer, 2018.
-
diff --git a/Target_transformation/target_transformation.ipynb b/Target_transformation/target_transformation.ipynb
new file mode 100644
index 00000000..7509d3b7
--- /dev/null
+++ b/Target_transformation/target_transformation.ipynb
@@ -0,0 +1,870 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " start | \n",
+ " steps | \n",
+ " temp_celsius | \n",
+ " rain | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 2023-01-01 00:00:00 | \n",
+ " 0.000000 | \n",
+ " 142.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 2023-01-01 00:10:00 | \n",
+ " 97.054054 | \n",
+ " 142.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 2023-01-01 00:20:00 | \n",
+ " 94.945946 | \n",
+ " 142.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 2023-01-01 00:30:00 | \n",
+ " 0.000000 | \n",
+ " 142.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 2023-01-01 00:40:00 | \n",
+ " 46.876667 | \n",
+ " 142.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 22460 | \n",
+ " 2023-06-05 23:20:00 | \n",
+ " 0.000000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | 22461 | \n",
+ " 2023-06-05 23:30:00 | \n",
+ " 0.000000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | 22462 | \n",
+ " 2023-06-05 23:40:00 | \n",
+ " 0.000000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | 22463 | \n",
+ " 2023-06-05 23:50:00 | \n",
+ " 0.000000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | 22464 | \n",
+ " 2023-06-06 00:00:00 | \n",
+ " 0.000000 | \n",
+ " 95.0 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
22465 rows × 4 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " start steps temp_celsius rain\n",
+ "0 2023-01-01 00:00:00 0.000000 142.0 0.0\n",
+ "1 2023-01-01 00:10:00 97.054054 142.0 0.0\n",
+ "2 2023-01-01 00:20:00 94.945946 142.0 0.0\n",
+ "3 2023-01-01 00:30:00 0.000000 142.0 0.0\n",
+ "4 2023-01-01 00:40:00 46.876667 142.0 0.0\n",
+ "... ... ... ... ...\n",
+ "22460 2023-06-05 23:20:00 0.000000 NaN NaN\n",
+ "22461 2023-06-05 23:30:00 0.000000 NaN NaN\n",
+ "22462 2023-06-05 23:40:00 0.000000 NaN NaN\n",
+ "22463 2023-06-05 23:50:00 0.000000 NaN NaN\n",
+ "22464 2023-06-06 00:00:00 0.000000 95.0 NaN\n",
+ "\n",
+ "[22465 rows x 4 columns]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "New_weather_steps = pd.read_csv(\"New_weather_steps.csv\")\n",
+ "New_weather_steps = New_weather_steps[['start', 'steps', 'temp_celsius', 'rain']]\n",
+ "display(New_weather_steps)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "C:\\Users\\irene\\AppData\\Local\\Temp\\ipykernel_25532\\4240309728.py:7: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " New_weather_steps[\"temp_celsius\"][i] = \"Chilly\"\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " start | \n",
+ " steps | \n",
+ " temp_celsius | \n",
+ " rain | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 2023-01-01 00:00:00 | \n",
+ " 0.000000 | \n",
+ " Chilly | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 2023-01-01 00:10:00 | \n",
+ " 97.054054 | \n",
+ " Chilly | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 2023-01-01 00:20:00 | \n",
+ " 94.945946 | \n",
+ " Chilly | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 2023-01-01 00:30:00 | \n",
+ " 0.000000 | \n",
+ " Chilly | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 2023-01-01 00:40:00 | \n",
+ " 46.876667 | \n",
+ " Chilly | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 22460 | \n",
+ " 2023-06-05 23:20:00 | \n",
+ " 0.000000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | 22461 | \n",
+ " 2023-06-05 23:30:00 | \n",
+ " 0.000000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | 22462 | \n",
+ " 2023-06-05 23:40:00 | \n",
+ " 0.000000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | 22463 | \n",
+ " 2023-06-05 23:50:00 | \n",
+ " 0.000000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | 22464 | \n",
+ " 2023-06-06 00:00:00 | \n",
+ " 0.000000 | \n",
+ " Cold | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
22465 rows × 4 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " start steps temp_celsius rain\n",
+ "0 2023-01-01 00:00:00 0.000000 Chilly 0.0\n",
+ "1 2023-01-01 00:10:00 97.054054 Chilly 0.0\n",
+ "2 2023-01-01 00:20:00 94.945946 Chilly 0.0\n",
+ "3 2023-01-01 00:30:00 0.000000 Chilly 0.0\n",
+ "4 2023-01-01 00:40:00 46.876667 Chilly 0.0\n",
+ "... ... ... ... ...\n",
+ "22460 2023-06-05 23:20:00 0.000000 NaN NaN\n",
+ "22461 2023-06-05 23:30:00 0.000000 NaN NaN\n",
+ "22462 2023-06-05 23:40:00 0.000000 NaN NaN\n",
+ "22463 2023-06-05 23:50:00 0.000000 NaN NaN\n",
+ "22464 2023-06-06 00:00:00 0.000000 Cold NaN\n",
+ "\n",
+ "[22465 rows x 4 columns]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "for i in range(len(New_weather_steps)):\n",
+ " if New_weather_steps[\"temp_celsius\"][i] < (0):\n",
+ " New_weather_steps[\"temp_celsius\"][i] = \"Freezing\"\n",
+ " elif New_weather_steps[\"temp_celsius\"][i] < (10*10):\n",
+ " New_weather_steps[\"temp_celsius\"][i] = \"Cold\"\n",
+ " elif New_weather_steps[\"temp_celsius\"][i] < (15*10):\n",
+ " New_weather_steps[\"temp_celsius\"][i] = \"Chilly\"\n",
+ " elif New_weather_steps[\"temp_celsius\"][i] < (20*10):\n",
+ " New_weather_steps[\"temp_celsius\"][i] = \"Comfortable\"\n",
+ " elif New_weather_steps[\"temp_celsius\"][i] < (25*10):\n",
+ " New_weather_steps[\"temp_celsius\"][i] = \"Warm\"\n",
+ " elif New_weather_steps[\"temp_celsius\"][i] >= (25*10):\n",
+ " New_weather_steps[\"temp_celsius\"][i] = \"Hot\"\n",
+ "display(New_weather_steps)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "C:\\Users\\irene\\AppData\\Local\\Temp\\ipykernel_25532\\475042030.py:12: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " New_weather_steps[\"combined\"][j] = 5\n",
+ "C:\\Users\\irene\\AppData\\Local\\Temp\\ipykernel_25532\\475042030.py:14: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " New_weather_steps[\"combined\"][j] = 6\n",
+ "C:\\Users\\irene\\AppData\\Local\\Temp\\ipykernel_25532\\475042030.py:8: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " New_weather_steps[\"combined\"][j] = 3\n",
+ "C:\\Users\\irene\\AppData\\Local\\Temp\\ipykernel_25532\\475042030.py:10: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " New_weather_steps[\"combined\"][j] = 4\n",
+ "C:\\Users\\irene\\AppData\\Local\\Temp\\ipykernel_25532\\475042030.py:4: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " New_weather_steps[\"combined\"][j] = 1\n",
+ "C:\\Users\\irene\\AppData\\Local\\Temp\\ipykernel_25532\\475042030.py:6: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " New_weather_steps[\"combined\"][j] = 2\n",
+ "C:\\Users\\irene\\AppData\\Local\\Temp\\ipykernel_25532\\475042030.py:16: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " New_weather_steps[\"combined\"][j] = 7\n",
+ "C:\\Users\\irene\\AppData\\Local\\Temp\\ipykernel_25532\\475042030.py:18: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " New_weather_steps[\"combined\"][j] = 8\n",
+ "C:\\Users\\irene\\AppData\\Local\\Temp\\ipykernel_25532\\475042030.py:20: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " New_weather_steps[\"combined\"][j] = 9\n",
+ "C:\\Users\\irene\\AppData\\Local\\Temp\\ipykernel_25532\\475042030.py:24: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " New_weather_steps[\"combined\"][j] = 11\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " start | \n",
+ " steps | \n",
+ " temp_celsius | \n",
+ " rain | \n",
+ " combined | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 2023-01-01 00:00:00 | \n",
+ " 0.000000 | \n",
+ " Chilly | \n",
+ " 0.0 | \n",
+ " 5.0 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 2023-01-01 00:10:00 | \n",
+ " 97.054054 | \n",
+ " Chilly | \n",
+ " 0.0 | \n",
+ " 5.0 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 2023-01-01 00:20:00 | \n",
+ " 94.945946 | \n",
+ " Chilly | \n",
+ " 0.0 | \n",
+ " 5.0 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 2023-01-01 00:30:00 | \n",
+ " 0.000000 | \n",
+ " Chilly | \n",
+ " 0.0 | \n",
+ " 5.0 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 2023-01-01 00:40:00 | \n",
+ " 46.876667 | \n",
+ " Chilly | \n",
+ " 0.0 | \n",
+ " 5.0 | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " 2023-01-01 00:50:00 | \n",
+ " 126.326232 | \n",
+ " Chilly | \n",
+ " 0.0 | \n",
+ " 5.0 | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " 2023-01-01 01:00:00 | \n",
+ " 128.406776 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " 2023-01-01 01:10:00 | \n",
+ " 26.821821 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " 2023-01-01 01:20:00 | \n",
+ " 0.000000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " 2023-01-01 01:30:00 | \n",
+ " 94.514644 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | 10 | \n",
+ " 2023-01-01 01:40:00 | \n",
+ " 83.035725 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | 11 | \n",
+ " 2023-01-01 01:50:00 | \n",
+ " 1.945755 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | 12 | \n",
+ " 2023-01-01 02:00:00 | \n",
+ " 0.000000 | \n",
+ " Chilly | \n",
+ " 0.0 | \n",
+ " 5.0 | \n",
+ "
\n",
+ " \n",
+ " | 13 | \n",
+ " 2023-01-01 02:10:00 | \n",
+ " 117.962963 | \n",
+ " Chilly | \n",
+ " 0.0 | \n",
+ " 5.0 | \n",
+ "
\n",
+ " \n",
+ " | 14 | \n",
+ " 2023-01-01 02:20:00 | \n",
+ " 134.037037 | \n",
+ " Chilly | \n",
+ " 0.0 | \n",
+ " 5.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " start steps temp_celsius rain combined\n",
+ "0 2023-01-01 00:00:00 0.000000 Chilly 0.0 5.0\n",
+ "1 2023-01-01 00:10:00 97.054054 Chilly 0.0 5.0\n",
+ "2 2023-01-01 00:20:00 94.945946 Chilly 0.0 5.0\n",
+ "3 2023-01-01 00:30:00 0.000000 Chilly 0.0 5.0\n",
+ "4 2023-01-01 00:40:00 46.876667 Chilly 0.0 5.0\n",
+ "5 2023-01-01 00:50:00 126.326232 Chilly 0.0 5.0\n",
+ "6 2023-01-01 01:00:00 128.406776 NaN NaN NaN\n",
+ "7 2023-01-01 01:10:00 26.821821 NaN NaN NaN\n",
+ "8 2023-01-01 01:20:00 0.000000 NaN NaN NaN\n",
+ "9 2023-01-01 01:30:00 94.514644 NaN NaN NaN\n",
+ "10 2023-01-01 01:40:00 83.035725 NaN NaN NaN\n",
+ "11 2023-01-01 01:50:00 1.945755 NaN NaN NaN\n",
+ "12 2023-01-01 02:00:00 0.000000 Chilly 0.0 5.0\n",
+ "13 2023-01-01 02:10:00 117.962963 Chilly 0.0 5.0\n",
+ "14 2023-01-01 02:20:00 134.037037 Chilly 0.0 5.0"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "New_weather_steps[\"combined\"] = np.nan\n",
+ "for j in range(len(New_weather_steps)):\n",
+ " if New_weather_steps[\"temp_celsius\"][j] == \"Freezing\" and New_weather_steps[\"rain\"][j] == 0:\n",
+ " New_weather_steps[\"combined\"][j] = 1\n",
+ " elif New_weather_steps[\"temp_celsius\"][j] == \"Freezing\" and New_weather_steps[\"rain\"][j] == 1:\n",
+ " New_weather_steps[\"combined\"][j] = 2\n",
+ " elif New_weather_steps[\"temp_celsius\"][j] == \"Cold\" and New_weather_steps[\"rain\"][j] == 0:\n",
+ " New_weather_steps[\"combined\"][j] = 3\n",
+ " elif New_weather_steps[\"temp_celsius\"][j] == \"Cold\" and New_weather_steps[\"rain\"][j] == 1:\n",
+ " New_weather_steps[\"combined\"][j] = 4\n",
+ " elif New_weather_steps[\"temp_celsius\"][j] == \"Chilly\" and New_weather_steps[\"rain\"][j] == 0:\n",
+ " New_weather_steps[\"combined\"][j] = 5\n",
+ " elif New_weather_steps[\"temp_celsius\"][j] == \"Chilly\" and New_weather_steps[\"rain\"][j] == 1:\n",
+ " New_weather_steps[\"combined\"][j] = 6\n",
+ " elif New_weather_steps[\"temp_celsius\"][j] == \"Comfortable\" and New_weather_steps[\"rain\"][j] == 0:\n",
+ " New_weather_steps[\"combined\"][j] = 7\n",
+ " elif New_weather_steps[\"temp_celsius\"][j] == \"Comfortable\" and New_weather_steps[\"rain\"][j] == 1:\n",
+ " New_weather_steps[\"combined\"][j] = 8\n",
+ " elif New_weather_steps[\"temp_celsius\"][j] == \"Warm\" and New_weather_steps[\"rain\"][j] == 0:\n",
+ " New_weather_steps[\"combined\"][j] = 9\n",
+ " elif New_weather_steps[\"temp_celsius\"][j] == \"Warm\" and New_weather_steps[\"rain\"][j] == 1:\n",
+ " New_weather_steps[\"combined\"][j] = 10\n",
+ " elif New_weather_steps[\"temp_celsius\"][j] == \"Hot\" and New_weather_steps[\"rain\"][j] == 0:\n",
+ " New_weather_steps[\"combined\"][j] = 11\n",
+ " elif New_weather_steps[\"temp_celsius\"][j] == \"Hot\" and New_weather_steps[\"rain\"][j] == 1:\n",
+ " New_weather_steps[\"combined\"][j] = 12\n",
+ "display(New_weather_steps.iloc[:15])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " start | \n",
+ " steps | \n",
+ " combined | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 2023-01-01 00:00:00 | \n",
+ " 0.000000 | \n",
+ " 5.0 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 2023-01-01 00:10:00 | \n",
+ " 97.054054 | \n",
+ " 5.0 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 2023-01-01 00:20:00 | \n",
+ " 94.945946 | \n",
+ " 5.0 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 2023-01-01 00:30:00 | \n",
+ " 0.000000 | \n",
+ " 5.0 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 2023-01-01 00:40:00 | \n",
+ " 46.876667 | \n",
+ " 5.0 | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 22447 | \n",
+ " 2023-06-05 21:10:00 | \n",
+ " 0.000000 | \n",
+ " 5.0 | \n",
+ "
\n",
+ " \n",
+ " | 22448 | \n",
+ " 2023-06-05 21:20:00 | \n",
+ " 0.000000 | \n",
+ " 5.0 | \n",
+ "
\n",
+ " \n",
+ " | 22449 | \n",
+ " 2023-06-05 21:30:00 | \n",
+ " 0.000000 | \n",
+ " 5.0 | \n",
+ "
\n",
+ " \n",
+ " | 22450 | \n",
+ " 2023-06-05 21:40:00 | \n",
+ " 0.000000 | \n",
+ " 5.0 | \n",
+ "
\n",
+ " \n",
+ " | 22451 | \n",
+ " 2023-06-05 21:50:00 | \n",
+ " 0.000000 | \n",
+ " 5.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
11652 rows × 3 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " start steps combined\n",
+ "0 2023-01-01 00:00:00 0.000000 5.0\n",
+ "1 2023-01-01 00:10:00 97.054054 5.0\n",
+ "2 2023-01-01 00:20:00 94.945946 5.0\n",
+ "3 2023-01-01 00:30:00 0.000000 5.0\n",
+ "4 2023-01-01 00:40:00 46.876667 5.0\n",
+ "... ... ... ...\n",
+ "22447 2023-06-05 21:10:00 0.000000 5.0\n",
+ "22448 2023-06-05 21:20:00 0.000000 5.0\n",
+ "22449 2023-06-05 21:30:00 0.000000 5.0\n",
+ "22450 2023-06-05 21:40:00 0.000000 5.0\n",
+ "22451 2023-06-05 21:50:00 0.000000 5.0\n",
+ "\n",
+ "[11652 rows x 3 columns]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "New_weather_steps = New_weather_steps[New_weather_steps['combined'].notna()]\n",
+ "New_weather_steps = New_weather_steps.drop(columns=['temp_celsius', 'rain'])\n",
+ "display(New_weather_steps)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "New_weather_steps.to_csv(\"Transformed_weather_steps.csv\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | col_0 | \n",
+ " count | \n",
+ "
\n",
+ " \n",
+ " | combined | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 1.0 | \n",
+ " 510 | \n",
+ "
\n",
+ " \n",
+ " | 2.0 | \n",
+ " 6 | \n",
+ "
\n",
+ " \n",
+ " | 3.0 | \n",
+ " 5460 | \n",
+ "
\n",
+ " \n",
+ " | 4.0 | \n",
+ " 1698 | \n",
+ "
\n",
+ " \n",
+ " | 5.0 | \n",
+ " 2370 | \n",
+ "
\n",
+ " \n",
+ " | 6.0 | \n",
+ " 672 | \n",
+ "
\n",
+ " \n",
+ " | 7.0 | \n",
+ " 726 | \n",
+ "
\n",
+ " \n",
+ " | 8.0 | \n",
+ " 24 | \n",
+ "
\n",
+ " \n",
+ " | 9.0 | \n",
+ " 180 | \n",
+ "
\n",
+ " \n",
+ " | 11.0 | \n",
+ " 6 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ "col_0 count\n",
+ "combined \n",
+ "1.0 510\n",
+ "2.0 6\n",
+ "3.0 5460\n",
+ "4.0 1698\n",
+ "5.0 2370\n",
+ "6.0 672\n",
+ "7.0 726\n",
+ "8.0 24\n",
+ "9.0 180\n",
+ "11.0 6"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "#New_weather_steps[\"combined\"].value_counts(sort = True)\n",
+ "pd.crosstab(index=New_weather_steps['combined'], columns='count')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 46,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Correlation between steps and combined is: -0.04\n"
+ ]
+ }
+ ],
+ "source": [
+ "col1, col2 = \"steps\", \"combined\"\n",
+ "corr = New_weather_steps[col1].corr(New_weather_steps[col2])\n",
+ "print (\"Correlation between \", col1, \" and \", col2, \"is: \", round(corr, 2))"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.13"
+ },
+ "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/Target_transformation/target_transformation.py b/Target_transformation/target_transformation.py
new file mode 100644
index 00000000..686a9c18
--- /dev/null
+++ b/Target_transformation/target_transformation.py
@@ -0,0 +1,59 @@
+"""Turn temperature and rain readings into one combined weather code.
+
+Reads ``New_weather_steps.csv``, derives a ``combined`` class (1-12) from
+``temp_celsius`` and ``rain`` and writes the rows that have one, without the
+two source columns, to ``Transformed_weather_steps.csv``.
+"""
+import pandas as pd
+import numpy as np
+
+
+# Exclusive upper bounds of the temperature classes Freezing, Cold, Chilly,
+# Comfortable and Warm; anything at or above the last bound is Hot. The
+# bounds are degrees times ten, which suggests temp_celsius is stored in
+# tenths of a degree -- TODO confirm against the weather export.
+TEMP_UPPER_BOUNDS = np.array([0, 10, 15, 20, 25]) * 10
+
+
+def combined_weather_code(temp, rain):
+    """Return the combined temperature/rain class for every row.
+
+    The code is ``2 * temperature_class + rain + 1`` with the temperature
+    classes numbered 0 (Freezing) to 5 (Hot): 1 is Freezing with rain flag 0,
+    2 is Freezing with rain flag 1, and so on up to 12 for Hot with flag 1.
+
+    Args:
+        temp: Numeric Series of temperatures, in the unit of
+            ``TEMP_UPPER_BOUNDS``.
+        rain: Numeric Series holding the flag values 0 or 1.
+
+    Returns:
+        Float Series on the index of ``temp``; NaN where the temperature is
+        missing or ``rain`` is neither 0 nor 1.
+    """
+    temp_values = temp.to_numpy(dtype=float)
+    rain_values = rain.to_numpy(dtype=float)
+    # side="right" puts a value equal to a bound into the warmer class, the
+    # same result as testing ``value < bound`` from the coldest class upwards.
+    temp_class = np.searchsorted(TEMP_UPPER_BOUNDS, temp_values, side="right")
+    known = ~np.isnan(temp_values) & np.isin(rain_values, (0, 1))
+    codes = np.where(known, 2 * temp_class + rain_values + 1, np.nan)
+    return pd.Series(codes, index=temp.index)
+
+
+#load dataset
+New_weather_steps = pd.read_csv("New_weather_steps.csv")
+New_weather_steps = New_weather_steps[['start', 'steps', 'temp_celsius', 'rain']]
+
+#create combined column
+# Computed for the whole column at once: writing row by row through
+# ``frame[column][i] = value`` is chained assignment, which pandas does not
+# guarantee to update the frame.
+New_weather_steps["combined"] = combined_weather_code(
+    New_weather_steps["temp_celsius"], New_weather_steps["rain"])
+
+#remove na, and columns
+New_weather_steps = New_weather_steps[New_weather_steps['combined'].notna()]
+New_weather_steps = New_weather_steps.drop(columns=['temp_celsius', 'rain'])
+
+New_weather_steps.to_csv("Transformed_weather_steps.csv")
\ No newline at end of file
diff --git a/combine_data_.py b/combine_data_.py
new file mode 100644
index 00000000..c980974c
--- /dev/null
+++ b/combine_data_.py
@@ -0,0 +1,208 @@
+"""Merge hourly weather observations with step counts on a regular time grid."""
+import pandas as pd
+import numpy as np
+from tqdm import tqdm
+
+# replace confusing names with better names derived from .txt file
+WEATHER_COLUMN_NAMES = {'HH': 'hour',
+                        'DD': 'direction_wind',
+                        'FH': 'windspeed_avg_hour',
+                        'FF': 'windspeed_avg_10min',
+                        'FX': 'max_wind_gust',
+                        'T': 'temp_celsius',
+                        'T10N': 'temp_min_6h',
+                        'TD': 'temp_dewpoint',
+                        'SQ': 'sunshine_duration',
+                        'Q': 'glob_radiation',
+                        'DR': 'precipitation_duration',
+                        'RH': 'precipitation_amount_hourly',
+                        'P': 'air_pressure',
+                        'VV': 'horizontal_visibility',
+                        'N': 'cloud_cover',
+                        'U': 'relative_humidity',
+                        'WW': 'weather_code',
+                        'IX': 'indicator_present_weather_code',
+                        'M': 'fog',
+                        'R': 'rain',
+                        'S': 'snow',
+                        'O': 'thunder',
+                        'Y': 'ice_formation'}
+
+
+def _to_naive_seconds(timestamps):
+    """Parse timestamps, keeping their wall-clock time cut to whole seconds.
+
+    Formatting without a UTC offset and parsing the text again returns
+    timezone-naive values even when the input carried an offset.
+    """
+    wall_clock = pd.to_datetime(timestamps).dt.strftime('%Y-%m-%d %H:%M:%S')
+    return pd.to_datetime(wall_clock)
+
+
+def fix_weather_data(dir: str, rows_skipped=30):
+    """Load the raw weather file and return it typed and timestamped.
+
+    Args:
+        dir: Path of the comma-separated weather file.
+        rows_skipped: Number of leading lines to skip before the header row.
+
+    Returns:
+        DataFrame whose first two columns, ``start_weather`` and
+        ``end_weather``, span one hour per row. The measurement columns are
+        nullable ``Int32`` (``precipitation_amount_hourly`` is float) and the
+        last column, ``datetime``, repeats ``start_weather``.
+    """
+    print('fixing weather data')
+    weather_raw = pd.read_csv(dir, dtype=str, skiprows=rows_skipped)
+
+    # Strip the header names first so the renaming does not depend on how
+    # many padding spaces the file puts in front of each name.
+    weather = weather_raw.rename(columns=str.strip)
+    weather = weather.rename(columns=WEATHER_COLUMN_NAMES)
+
+    # remove whitespace from values; empty cells become missing values
+    weather = weather.apply(lambda column: column.str.strip())
+    weather = weather.replace('', np.nan)
+
+    # change datatype
+    for name in weather.columns:
+        numbers = weather[name].astype(float)
+        if name != 'precipitation_amount_hourly':
+            numbers = numbers.astype("Int32")
+        weather[name] = numbers
+
+    # fix datetimes: hour h becomes the window that starts at h - 1 o'clock,
+    # so hours 1..24 map onto 00:00..23:00 of the same date. Adding a
+    # timedelta avoids parsing the hour number as text.
+    day = pd.to_datetime(weather['YYYYMMDD'].astype(str), format='%Y%m%d')
+    hours_since_midnight = (weather['hour'] - 1).astype(float)
+    start = day + pd.to_timedelta(hours_since_midnight, unit='h')
+
+    weather = weather.drop(columns=['YYYYMMDD', 'hour'])
+    weather['datetime'] = start
+    weather.insert(0, 'start_weather', start)
+    weather.insert(1, 'end_weather', start + pd.Timedelta(hours=1))
+    return weather
+
+
+def construct_time_intervals(df, Tdelta='15min'):
+    """Aggregate merged step/weather rows into fixed-length intervals.
+
+    Args:
+        df: DataFrame with a datetime ``startDate`` column, a ``value``
+            column and the weather columns aggregated below.
+        Tdelta: Interval length, e.g. ``'15min'``.
+
+    Returns:
+        DataFrame with one row per interval: ``value`` is summed, the weather
+        columns are averaged and ``datetime`` is the resample label moved
+        forward by one interval.
+    """
+    aggregations = {'value': 'sum'}
+    for column in ('windspeed_avg_hour', 'temp_celsius', 'sunshine_duration',
+                   'precipitation_duration', 'fog', 'rain', 'snow', 'thunder',
+                   'ice_formation'):
+        aggregations[column] = 'mean'
+
+    dat_int = df.resample(Tdelta, on='startDate').agg(aggregations).reset_index()
+    # The label column has to be renamed before it can be shifted; the shift
+    # follows Tdelta so it stays one interval long for any interval length.
+    dat_int = dat_int.rename(columns={'startDate': 'datetime'})
+    dat_int['datetime'] = dat_int['datetime'] + pd.Timedelta(Tdelta)
+    return dat_int
+
+
+def fix_steps_time_intervals(dir, start_date, end_date, delta_t):
+    """Spread step records over seconds and sum them again per ``delta_t``.
+
+    A one-second grid runs from ``start_date`` to ``end_date``. The ``value``
+    of each record is divided evenly over the grid seconds that lie completely
+    inside its ``startDate``..``endDate`` span; where records share a second,
+    the one later in the file wins. The per-second values are then summed per
+    ``delta_t`` bin.
+
+    Args:
+        dir: Path of the step-count CSV with ``startDate``, ``endDate`` and
+            ``value`` columns.
+        start_date: Start of the first grid second.
+        end_date: Start of the last grid second.
+        delta_t: Bin length, e.g. ``'10min'``.
+
+    Returns:
+        DataFrame with columns ``start``, ``end`` and ``steps``, one row per
+        bin. A second is counted in the bin that contains its end instant.
+    """
+    print('fixing steps data')
+    steps = pd.read_csv(dir)
+    one_second = pd.Timedelta(seconds=1)
+    grid_start = pd.to_datetime(start_date)
+    grid_span = pd.to_datetime(end_date) - grid_start
+    n_seconds = max(int(grid_span // one_second) + 1, 0)
+
+    # Offsets of every record from grid_start, in seconds. All records are
+    # considered, not only those from the month of start_date; records outside
+    # the grid cover no second and are skipped.
+    begin = (_to_naive_seconds(steps['startDate']) - grid_start) / one_second
+    finish = (_to_naive_seconds(steps['endDate']) - grid_start) / one_second
+    first = np.clip(np.ceil(begin.to_numpy(dtype=float)), 0, n_seconds)
+    stop = np.clip(np.floor(finish.to_numpy(dtype=float)), 0, n_seconds)
+    covered = stop - first
+    values = steps['value'].to_numpy(dtype=float)
+
+    # Writing through a positional slice touches only the seconds a record
+    # covers instead of comparing the record with every second of the grid.
+    per_second = np.zeros(n_seconds)
+    for i in tqdm(np.flatnonzero(covered > 0)):
+        per_second[int(first[i]):int(stop[i])] = values[i] / covered[i]
+
+    second_ends = pd.date_range(start=grid_start + one_second,
+                                periods=n_seconds, freq=one_second)
+    new_steps = pd.Series(per_second, index=second_ends, name='steps')
+    new_steps = new_steps.rename_axis('start').resample(delta_t).sum().reset_index()
+    new_steps['end'] = new_steps['start'] + pd.Timedelta(delta_t)
+
+    new_steps = new_steps[['start', 'end', 'steps']]
+    return new_steps
+
+
+def merge_steps_weather(dir_weather: str, dir_steps: str, start_date, end_date, delta_t, steps_column_datetime='start'):
+    """Build the step bins and attach a weather row to each of them.
+
+    Args:
+        dir_weather: Path of the raw weather file.
+        dir_steps: Path of the step-count CSV.
+        start_date: Start of the period, passed to ``fix_steps_time_intervals``.
+        end_date: End of the period, passed to ``fix_steps_time_intervals``.
+        delta_t: Length of the step bins, e.g. ``'10min'``.
+        steps_column_datetime: Column of the step bins to merge on.
+
+    Returns:
+        The step bins joined with the latest weather row whose ``datetime`` is
+        at or before the bin's merge time and no more than 60 minutes earlier;
+        bins without such a row get missing weather values.
+    """
+    steps = fix_steps_time_intervals(dir_steps, start_date, end_date, delta_t)
+    weather = fix_weather_data(dir_weather)
+
+    print('merging weather and steps data')
+
+    # decide on column to merge by, change steps_column_datetime to choose another datetime column
+    weather = weather.rename(columns={'datetime': steps_column_datetime})
+    steps[steps_column_datetime] = _to_naive_seconds(steps[steps_column_datetime])
+
+    # merge_asof requires both frames to be sorted by the merge key
+    steps = steps.sort_values(steps_column_datetime)
+    weather = weather.sort_values(steps_column_datetime)
+
+    merge = pd.merge_asof(steps, weather, on=steps_column_datetime, tolerance=pd.Timedelta("60m"))
+
+    return merge
+
+
+res = merge_steps_weather(dir_weather='C:\\Users\\irene\\OneDrive\\Bureaublad\\ML\\ML4QS\\data_used\\weather.txt',
+                          dir_steps='C:\\Users\\irene\\OneDrive\\Bureaublad\\ML\\ML4QS\\data_used\\StepCount.csv',
+                          start_date='2023-01-01 00:00:00',
+                          end_date='2023-06-06 00:00:00',
+                          delta_t='10min')
+
+res.to_csv("New_weather_steps.csv")