From 88ca99094ee3afccdca706406d4d82ebcfaef7aa Mon Sep 17 00:00:00 2001 From: wangyis <40844333+wangyis@users.noreply.github.com> Date: Sun, 6 Dec 2020 20:18:24 -0500 Subject: [PATCH 1/2] submission --- .../sarimax_explanation/AirPassengers.csv | 1 + resources/sarimax_explanation/catfish.csv | 325 +++++++++ .../sarimax_explanation/retail_sales.csv | 73 ++ sarimax_explanation.Rmd | 653 ++++++++++++++++++ 4 files changed, 1052 insertions(+) create mode 100644 resources/sarimax_explanation/AirPassengers.csv create mode 100644 resources/sarimax_explanation/catfish.csv create mode 100644 resources/sarimax_explanation/retail_sales.csv create mode 100644 sarimax_explanation.Rmd diff --git a/resources/sarimax_explanation/AirPassengers.csv b/resources/sarimax_explanation/AirPassengers.csv new file mode 100644 index 0000000..3a127e2 --- /dev/null +++ b/resources/sarimax_explanation/AirPassengers.csv @@ -0,0 +1 @@ +Month,#Passengers 1949-01,112 1949-02,118 1949-03,132 1949-04,129 1949-05,121 1949-06,135 1949-07,148 1949-08,148 1949-09,136 1949-10,119 1949-11,104 1949-12,118 1950-01,115 1950-02,126 1950-03,141 1950-04,135 1950-05,125 1950-06,149 1950-07,170 1950-08,170 1950-09,158 1950-10,133 1950-11,114 1950-12,140 1951-01,145 1951-02,150 1951-03,178 1951-04,163 1951-05,172 1951-06,178 1951-07,199 1951-08,199 1951-09,184 1951-10,162 1951-11,146 1951-12,166 1952-01,171 1952-02,180 1952-03,193 1952-04,181 1952-05,183 1952-06,218 1952-07,230 1952-08,242 1952-09,209 1952-10,191 1952-11,172 1952-12,194 1953-01,196 1953-02,196 1953-03,236 1953-04,235 1953-05,229 1953-06,243 1953-07,264 1953-08,272 1953-09,237 1953-10,211 1953-11,180 1953-12,201 1954-01,204 1954-02,188 1954-03,235 1954-04,227 1954-05,234 1954-06,264 1954-07,302 1954-08,293 1954-09,259 1954-10,229 1954-11,203 1954-12,229 1955-01,242 1955-02,233 1955-03,267 1955-04,269 1955-05,270 1955-06,315 1955-07,364 1955-08,347 1955-09,312 1955-10,274 1955-11,237 1955-12,278 1956-01,284 1956-02,277 1956-03,317 1956-04,313 1956-05,318 1956-06,374 1956-07,413 1956-08,405 1956-09,355 1956-10,306 1956-11,271 1956-12,306 1957-01,315 1957-02,301 1957-03,356 1957-04,348 1957-05,355 1957-06,422 1957-07,465 1957-08,467 1957-09,404 1957-10,347 1957-11,305 1957-12,336 1958-01,340 1958-02,318 1958-03,362 1958-04,348 1958-05,363 1958-06,435 1958-07,491 1958-08,505 1958-09,404 1958-10,359 1958-11,310 1958-12,337 1959-01,360 1959-02,342 1959-03,406 1959-04,396 1959-05,420 1959-06,472 1959-07,548 1959-08,559 1959-09,463 1959-10,407 1959-11,362 1959-12,405 1960-01,417 1960-02,391 1960-03,419 1960-04,461 1960-05,472 1960-06,535 1960-07,622 1960-08,606 1960-09,508 1960-10,461 1960-11,390 1960-12,432 \ No newline at end of file diff --git a/resources/sarimax_explanation/catfish.csv b/resources/sarimax_explanation/catfish.csv new file mode 100644 index 0000000..1e21eef --- /dev/null +++ b/resources/sarimax_explanation/catfish.csv @@ -0,0 +1,325 @@ +Date,Total +1986-1-01,9034 +1986-2-01,9596 +1986-3-01,10558 +1986-4-01,9002 +1986-5-01,9239 +1986-6-01,8951 +1986-7-01,9668 +1986-8-01,10188 +1986-9-01,9896 +1986-10-01,10649 +1986-11-01,8917 +1986-12-01,8196 +1987-1-01,10768 +1987-2-01,12220 +1987-3-01,14463 +1987-4-01,12944 +1987-5-01,11001 +1987-6-01,11000 +1987-7-01,11876 +1987-8-01,13021 +1987-9-01,13494 +1987-10-01,14041 +1987-11-01,11312 +1987-12-01,10362 +1988-1-01,12533 +1988-2-01,14343 +1988-3-01,14676 +1988-4-01,12253 +1988-5-01,11840 +1988-6-01,12018 +1988-7-01,11819 +1988-8-01,12453 +1988-9-01,12309 +1988-10-01,13005 +1988-11-01,11815 +1988-12-01,10496 +1989-1-01,13996 +1989-2-01,15158 +1989-3-01,15587 +1989-4-01,14615 +1989-5-01,14792 +1989-6-01,14373 +1989-7-01,14573 +1989-8-01,14385 +1989-9-01,15607 +1989-10-01,15978 +1989-11-01,14436 +1989-12-01,12794 +1990-1-01,15771 +1990-2-01,16476 +1990-3-01,16420 +1990-4-01,15444 +1990-5-01,16118 +1990-6-01,15158 +1990-7-01,15214 +1990-8-01,16257 +1990-9-01,15287 +1990-10-01,15212 +1990-11-01,13488 +1990-12-01,12300 +1991-1-01,15794 +1991-2-01,17113 +1991-3-01,17536 +1991-4-01,15914 +1991-5-01,16716 +1991-6-01,15983 +1991-7-01,16878 +1991-8-01,17893 +1991-9-01,16697 +1991-10-01,18546 +1991-11-01,15486 +1991-12-01,15253 +1992-1-01,18698 +1992-2-01,21100 +1992-3-01,22425 +1992-4-01,19923 +1992-5-01,19454 +1992-6-01,18874 +1992-7-01,19676 +1992-8-01,19559 +1992-9-01,19500 +1992-10-01,19615 +1992-11-01,16814 +1992-12-01,15698 +1993-1-01,20273 +1993-2-01,20774 +1993-3-01,21309 +1993-4-01,19377 +1993-5-01,19211 +1993-6-01,19103 +1993-7-01,20086 +1993-8-01,19758 +1993-9-01,19162 +1993-10-01,19371 +1993-11-01,17709 +1993-12-01,17342 +1994-1-01,18772 +1994-2-01,19363 +1994-3-01,20001 +1994-4-01,17112 +1994-5-01,18027 +1994-6-01,17173 +1994-7-01,18024 +1994-8-01,18983 +1994-9-01,17983 +1994-10-01,19151 +1994-11-01,16427 +1994-12-01,15461 +1995-1-01,19191 +1995-2-01,20008 +1995-3-01,21702 +1995-4-01,18649 +1995-5-01,19169 +1995-6-01,18631 +1995-7-01,18157 +1995-8-01,20187 +1995-9-01,18660 +1995-10-01,19920 +1995-11-01,16680 +1995-12-01,16018 +1996-1-01,20322 +1996-2-01,20613 +1996-3-01,22704 +1996-4-01,20276 +1996-5-01,20669 +1996-6-01,18074 +1996-7-01,18719 +1996-8-01,20217 +1996-9-01,19642 +1996-10-01,20842 +1996-11-01,18204 +1996-12-01,16898 +1997-1-01,20746 +1997-2-01,23058 +1997-3-01,24624 +1997-4-01,22154 +1997-5-01,22444 +1997-6-01,21471 +1997-7-01,21866 +1997-8-01,22548 +1997-9-01,21518 +1997-10-01,23408 +1997-11-01,19645 +1997-12-01,18278 +1998-1-01,23576 +1998-2-01,26650 +1998-3-01,26207 +1998-4-01,23195 +1998-5-01,22960 +1998-6-01,23002 +1998-7-01,22973 +1998-8-01,24089 +1998-9-01,22805 +1998-10-01,23241 +1998-11-01,21581 +1998-12-01,21119 +1999-1-01,23107 +1999-2-01,25780 +1999-3-01,28544 +1999-4-01,23488 +1999-5-01,23964 +1999-6-01,23720 +1999-7-01,25069 +1999-8-01,24618 +1999-9-01,24430 +1999-10-01,25229 +1999-11-01,22344 +1999-12-01,22372 +2000-1-01,25412 +2000-2-01,25354 +2000-3-01,29161 +2000-4-01,24924 +2000-5-01,24763 +2000-6-01,25342 +2000-7-01,24911 +2000-8-01,25847 +2000-9-01,23743 +2000-10-01,25036 +2000-11-01,21911 +2000-12-01,20752 +2001-1-01,24507 +2001-2-01,25968 +2001-3-01,28752 +2001-4-01,25167 +2001-5-01,24728 +2001-6-01,23690 +2001-7-01,24816 +2001-8-01,26004 +2001-9-01,24210 +2001-10-01,25083 +2001-11-01,21807 +2001-12-01,21635 +2002-1-01,27173 +2002-2-01,29308 +2002-3-01,28645 +2002-4-01,25023 +2002-5-01,27261 +2002-6-01,24670 +2002-7-01,26441 +2002-8-01,27961 +2002-9-01,26498 +2002-10-01,27800 +2002-11-01,23939 +2002-12-01,22930 +2003-1-01,27584 +2003-2-01,27586 +2003-3-01,30485 +2003-4-01,26135 +2003-5-01,27370 +2003-6-01,25487 +2003-7-01,26427 +2003-8-01,27672 +2003-9-01,26853 +2003-10-01,27875 +2003-11-01,23416 +2003-12-01,22482 +2004-1-01,27140 +2004-2-01,28526 +2004-3-01,28845 +2004-4-01,25033 +2004-5-01,24764 +2004-6-01,24896 +2004-7-01,24623 +2004-8-01,26538 +2004-9-01,24674 +2004-10-01,25863 +2004-11-01,23156 +2004-12-01,22721 +2005-1-01,26204 +2005-2-01,26526 +2005-3-01,27473 +2005-4-01,24536 +2005-5-01,25764 +2005-6-01,25154 +2005-7-01,23729 +2005-8-01,25336 +2005-9-01,24649 +2005-10-01,25904 +2005-11-01,22868 +2005-12-01,21825 +2006-1-01,26955 +2006-2-01,27349 +2006-3-01,29367 +2006-4-01,24071 +2006-5-01,23173 +2006-6-01,21740 +2006-7-01,22056 +2006-8-01,23923 +2006-9-01,22189 +2006-10-01,22458 +2006-11-01,19476 +2006-12-01,21056 +2007-1-01,22842 +2007-2-01,22943 +2007-3-01,23445 +2007-4-01,19176 +2007-5-01,22058 +2007-6-01,19451 +2007-7-01,20378 +2007-8-01,21778 +2007-9-01,20431 +2007-10-01,22486 +2007-11-01,18889 +2007-12-01,18574 +2008-1-01,24658 +2008-2-01,24997 +2008-3-01,23987 +2008-4-01,21199 +2008-5-01,21205 +2008-6-01,21553 +2008-7-01,21166 +2008-8-01,20720 +2008-9-01,18067 +2008-10-01,21121 +2008-11-01,16617 +2008-12-01,15917 +2009-1-01,19262 +2009-2-01,20658 +2009-3-01,22660 +2009-4-01,20147 +2009-5-01,18818 +2009-6-01,19017 +2009-7-01,19451 +2009-8-01,18502 +2009-9-01,18893 +2009-10-01,19480 +2009-11-01,15743 +2009-12-01,16554 +2010-1-01,19465 +2010-2-01,22617 +2010-3-01,22438 +2010-4-01,18511 +2010-5-01,19142 +2010-6-01,19063 +2010-7-01,18006 +2010-8-01,20386 +2010-9-01,20280 +2010-10-01,18894 +2010-11-01,15941 +2010-12-01,16851 +2011-1-01,18204 +2011-2-01,15564 +2011-3-01,17984 +2011-4-01,13979 +2011-5-01,12591 +2011-6-01,12408 +2011-7-01,12779 +2011-8-01,14233 +2011-9-01,13410 +2011-10-01,12893 +2011-11-01,11843 +2011-12-01,11321 +2012-1-01,13427 +2012-2-01,14447 +2012-3-01,14717 +2012-4-01,11998 +2012-5-01,13379 +2012-6-01,12463 +2012-7-01,13276 +2012-8-01,14442 +2012-9-01,13422 +2012-10-01,13795 +2012-11-01,13352 +2012-12-01,12716 diff --git a/resources/sarimax_explanation/retail_sales.csv b/resources/sarimax_explanation/retail_sales.csv new file mode 100644 index 0000000..0b5499c --- /dev/null +++ b/resources/sarimax_explanation/retail_sales.csv @@ -0,0 +1,73 @@ +date,sales +2009-10-01,338630 +2009-11-01,339386 +2009-12-01,400264 +2010-01-01,314640 +2010-02-01,311022 +2010-03-01,360819 +2010-04-01,356460 +2010-05-01,365713 +2010-06-01,358675 +2010-07-01,362027 +2010-08-01,362682 +2010-09-01,346069 +2010-10-01,355212 +2010-11-01,365809 +2010-12-01,426654 +2011-01-01,335608 +2011-02-01,337352 +2011-03-01,387092 +2011-04-01,380754 +2011-05-01,391970 +2011-06-01,388636 +2011-07-01,384600 +2011-08-01,394548 +2011-09-01,374895 +2011-10-01,379364 +2011-11-01,391081 +2011-12-01,451669 +2012-01-01,355058 +2012-02-01,372523 +2012-03-01,414275 +2012-04-01,393035 +2012-05-01,418648 +2012-06-01,400996 +2012-07-01,396020 +2012-08-01,417911 +2012-09-01,385597 +2012-10-01,399341 +2012-11-01,410992 +2012-12-01,461994 +2013-01-01,375537 +2013-02-01,373938 +2013-03-01,421638 +2013-04-01,408381 +2013-05-01,436985 +2013-06-01,414701 +2013-07-01,422357 +2013-08-01,434950 +2013-09-01,396199 +2013-10-01,415740 +2013-11-01,423611 +2013-12-01,477205 +2014-01-01,383399 +2014-02-01,380315 +2014-03-01,432806 +2014-04-01,431415 +2014-05-01,458822 +2014-06-01,433152 +2014-07-01,443005 +2014-08-01,450913 +2014-09-01,420871 +2014-10-01,437702 +2014-11-01,437910 +2014-12-01,501232 +2015-01-01,397252 +2015-02-01,386935 +2015-03-01,444110 +2015-04-01,438217 +2015-05-01,462615 +2015-06-01,448229 +2015-07-01,457710 +2015-08-01,456340 +2015-09-01,430917 diff --git a/sarimax_explanation.Rmd b/sarimax_explanation.Rmd new file mode 100644 index 0000000..25f2f51 --- /dev/null +++ b/sarimax_explanation.Rmd @@ -0,0 +1,653 @@ +# Sarimax explanation + +Yishi Wang + + +```{r, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) +``` +## Setup +```{python} +from pandas import read_csv +from statsmodels.graphics.tsaplots import plot_acf, plot_pacf +from statsmodels.tsa.stattools import acf, pacf +from statsmodels.tsa.ar_model import AR +from statsmodels.tsa.arima_model import ARIMA +from datetime import datetime, timedelta +import matplotlib +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import warnings +from sklearn.metrics import r2_score +from statsmodels.tsa.arima_model import ARMA +from scipy.stats import norm +import statsmodels.api as sm +import requests +from io import BytesIO +``` +```{python} +matplotlib.rc('xtick', labelsize=20) +matplotlib.rc('ytick', labelsize=20) +plt.rcParams.update({'font.size': 22}) +warnings.filterwarnings('ignore') +pd.plotting.register_matplotlib_converters() +``` +## ACF (Autocorrelation fuction) + +Time-series data by definition are ordered by time. Therefore, there is extra information in it people could take advantage of. We use ACF to decide which past events have heavier impect on current data. + +Let's look on raw data of air passengers per month: +```{python} +data = pd.read_csv('resources/sarimax_explanation/AirPassengers.csv').drop(['Month'],axis=1)#.head(100) +data_a = data.to_numpy().T[0] +plt.figure(figsize=(20,10)) +plt.xlabel("Months") +plt.ylabel("Number of air passengers") +plt.grid() +plt.plot(data_a) +``` +and its ACF plot: +```{python} +plt.rc("figure", figsize=(20,10)) +plt.figure(figsize=(20,10)) +plot_acf(data_a, lags=50) +plt.xlabel("Months") +plt.grid() +plt.show() +``` +What we can see here are bars and a horizontal cone. This cone pictures the confidence level (by default set to 95%). In other words, if the point is outside the cone (on white) we may say that with 95% probability it has a certain impact on values. If the bar is inside the cone (on blue) we may ignore this particular lag as most likely it is not relevant. + +the bars, which are vital here, show how much past data point impacts the following ones: + +1. The first point (with index 0) has a height 1. In fact the first one is always 1. It make sense as current value always explain fully current value. +2. The second point is around 0.9 which means that following point (directly next one) is described in 90% by the previous value. +3. Eleventh bar has a height of 0.75. This means that current data will impact data in 11 month by 75% (data resolution is monthly). + +## PACF(Partial autocorrelation function) +In time series analysis, the partial autocorrelation function (PACF) gives the partial correlation of a stationary time series with its own lagged values, regressed the values of the time series at all shorter lags. It contrasts with the autocorrelation function, which does not control for other lags. + +This function plays an important role in data analysis aimed at identifying the extent of the lag in an autoregressive model. The use of this function was introduced as part of the Box–Jenkins approach to time series modelling, whereby plotting the partial autocorrelative functions one could determine the appropriate lags p in an AR (p) model or in an extended ARIMA (p,d,q) model. + +## AR(Autoregression) Model +Autoregression modeling is a modeling technique used for time series data that assumes linear continuation of the series so that previous values in the time series can be used to predict futures values. Different from linear regression model, With the autoregression model, we use previous data points to predict future data point(s) but with multiple lag variables. + +An exmaple of AR model: +$$ +y = a + b_1X_{t-1}+b_2X_{t-2}+b_3X_{t-3} +$$ +where $a$, $b_1$, $b_2$ and $b_3$ are variables found during the training of the model and $X_{t-1}$, $X_{t-2}$ and $X_{t-3}$ are input variables at previous times within the data set and t is the lag value. + +To AR modeling, there are two assumptiopns: + +1. The previous time step(s) is useful in predicting the value at the next time step (dependance between values) +2. The data is stationary. A time series is stationary if is mean (and/or variance) is constant over time. There are other statistical properties to look at as well, but looking at the mean is usually the fastest/easiest. + +Here is an exmaple, the exmaple uses retial sales data. +```{python} +sales_data = pd.read_csv('resources/sarimax_explanation/retail_sales.csv') +sales_data['date']=pd.to_datetime(sales_data['date']) +sales_data.set_index('date', inplace=True) + +plt.figure(figsize=(20,10)) +plt.xlabel("Year") +plt.ylabel("Sales") +plt.grid() +plt.title('Sales VS years') +plt.plot(sales_data) +``` +plot its ACF: + +```{python} + +plt.rc("figure", figsize=(20,10)) +pd.plotting.autocorrelation_plot(sales_data['sales']) +plt.title("ACF") +plt.show() +``` + +From the ACF plot above, we can see there’s some significant correlation between t=1 and t=12 (roughly) with significant decline in correlation after that timeframe. Since we are looking at monthly sales data, this seems to make sense with correlations falling off at the start of the new fiscal year. + +From looking at above charts, we find that it is a non-stationary dataset due to the increasing trend from lower left to upper right.For simplicity, we don't test its trend in this document and use quick/dirty method of differencing to get a more stationary model: + +```{python} +sales_data['stationary']=sales_data['sales'].diff() +plt.figure(figsize=(20,10)) +plt.xlabel("Year") +plt.ylabel("Sales difference") +plt.grid() +plt.plot(sales_data['stationary']) +``` + + + Since the data becomes stationary, we then apply AR modeling in **statsmodels.libtrary**. + +```{python} +#create train/test datasets +X = sales_data['stationary'].dropna() +train_data = X[1:len(X)-12] +test_data = X[len(X)-12:] + +#train the autoregression model +model = AR(train_data) +model_fitted = model.fit() +``` + +In the above, we are simply creating a testing and training dataset and then creating and fitting our AR() model. Once we've fit the model, we can look at the chosen lag and parameters of the model using some simple print statements: +```{python} +print('The lag value chose is: %s' % model_fitted.k_ar) +``` + +```{python} +print('The coefficients of the model are:\n %s' % model_fitted.params) +``` +If we look back at our autocorrelation plot, we can see that the lag value of 10 is where the line first touches the 95% confidence level, which is usually the way we’d select the lag value when we first run autoregression models if we were selecting things manually, so the selection makes sense. + +Then we can make some forecasts: +```{python} +plt.rc("figure", figsize=(20,10)) +# make predictions +predictions = model_fitted.predict( + start=len(train_data), + end=len(train_data) + len(test_data)-1, + dynamic=False) + +# create a comparison dataframe +compare_df = pd.concat( + [sales_data['stationary'].tail(12), + predictions], axis=1).rename( + columns={'stationary': 'actual', 0:'predicted'}) + +# plot the two values +plt.grid() +compare_df.plot() + + +``` +The mean square error of the prediction is: +```{python} +r2_score(sales_data['stationary'].tail(12), predictions) +``` +This gives us a root mean square value of 0.64, which isn’t terrible but there is room for improvement here. + + +## MA(Moving Average) + +In statistics, a moving average (rolling average or running average) is a calculation to analyze data points by creating a series of averages of different subsets of the full data set. It is also called a moving mean (MM) or rolling mean and is a type of finite impulse response filter. Variations include: simple, and cumulative, or weighted forms. + +Given a series of numbers and a fixed subset size, the first element of the moving average is obtained by taking the average of the initial fixed subset of the number series. Then the subset is modified by "shifting forward"; that is, excluding the first number of the series and including the next value in the subset. + +A moving average is commonly used with time series data to smooth out short-term fluctuations and highlight longer-term trends or cycles. + +**Here is a example for applying MA:** + +Firstly, we generate some data using the equations below: +$$ +y_t = 50 + 0.4 \epsilon_{t-1} + 0.3\epsilon_{t-2} + \epsilon_{t} +$$ +$$ +\epsilon_t \sim N(0,1) +$$ + +```{python} +errors = np.random.normal(0, 1, 400) +``` + + +```{python} +date_index = pd.date_range(start='9/1/2019', end='1/1/2020') +``` + +```{python} +mu = 50 +series = [] +for t in range(1,len(date_index)+1): + series.append(mu + 0.4*errors[t-1] + 0.3*errors[t-2] + errors[t]) +``` + + +```{python} +series = pd.Series(series, date_index) +series = series.asfreq(pd.infer_freq(series.index)) +``` + + +```{python} +plt.figure(figsize=(20,4)) +plt.plot(series) +plt.axhline(mu, linestyle='--', color='grey') +``` +```{python} +def calc_corr(series, lag): + return pearsonr(series[:-lag], series[lag:])[0] +``` +Draw its ACF graph: +```{python} +acf_vals = acf(series) +num_lags = 10 +plt.figure(figsize=(10,4)) +plt.bar(range(num_lags), acf_vals[:num_lags]) +plt.show() +``` +Its PACF graph: +```{python} +pacf_vals = pacf(series) +num_lags = 25 +plt.figure(figsize=(10,4)) +plt.bar(range(num_lags), pacf_vals[:num_lags]) +plt.show() +``` +Fit MA Model +```{python} +#get training and testing sets +train_end = datetime(2019,12,30) +test_end = datetime(2020,1,1) + +train_data = series[:train_end] +test_data = series[train_end + timedelta(days=1):test_end] + +#Fit MA model +model = ARIMA(train_data, order=(0,0,2)) +model_fit = model.fit() +print(model_fit.summary()) +``` + +According to the summary, we've got the predicted model: +$$ +\hat y_t = 50 +0.37\epsilon_{t-1} + 0.25_ +{t-2} +$$ +Use mode lto predict the data and calculate its error: +```{python} +#get prediction start and end dates +pred_start_date = test_data.index[0] +pred_end_date = test_data.index[-1] + +#get the predictions and residuals +predictions = model_fit.predict(start=pred_start_date, end=pred_end_date) + +residuals = test_data - predictions + +#plot the prediction +plt.figure(figsize=(20,4)) + +plt.plot(series[-14:]) +plt.plot(predictions) + +plt.legend(('Data', 'Predictions'), fontsize=16) +``` + +```{python} +print('Mean Absolute Percent Error:', round(np.mean(abs(residuals/test_data)),4)) +``` + + +```{python} +print('Root Mean Squared Error:', np.sqrt(np.mean(residuals**2))) +``` +## ARMA(Autoregressive–moving-average) model + +In the statistical analysis of time series, autoregressive–moving-average (ARMA) models provide a parsimonious description of a (weakly) stationary stochastic process in terms of two polynomials, one for the autoregression (AR) and the second for the moving average (MA). The general ARMA model was described in the 1951 thesis of Peter Whittle, Hypothesis testing in time series analysis, and it was popularized in the 1970 book by George E. P. Box and Gwilym Jenkins. + +Given a time series of data Xt , the ARMA model is a tool for understanding and, perhaps, predicting future values in this series. The AR part involves regressing the variable on its own lagged (i.e., past) values. The MA part involves modeling the error term as a linear combination of error terms occurring contemporaneously and at various times in the past. The model is usually referred to as the ARMA(p,q) model where p is the order of the AR part and q is the order of the MA part. + +Here is an exmaple using ARMA model to predict the catfish sales data following the similar steps of MA prediction above: +```{python} + +def parser(s): + return datetime.strptime(s,'%Y-%m-%d') +``` +```{python} +#read data +catfish_sales = pd.read_csv('resources/sarimax_explanation/catfish.csv', parse_dates=[0], index_col=0, squeeze=True, date_parser=parser) + +#infer the frequency of the data +catfish_sales = catfish_sales.asfreq(pd.infer_freq(catfish_sales.index)) + +start_date = datetime(2000,1,1) +end_date = datetime(2004,1,1) +lim_catfish_sales = catfish_sales[start_date:end_date] + +#plot the original data +plt.figure(figsize=(20,4)) +plt.plot(lim_catfish_sales) +plt.title('Catfish Sales in 1000s of Pounds', fontsize=20) +plt.ylabel('Sales', fontsize=16) +for year in range(start_date.year,end_date.year): + plt.axvline(pd.to_datetime(str(year)+'-01-01'), color='k', linestyle='--', alpha=0.2) +plt.axhline(lim_catfish_sales.mean(), color='r', alpha=0.2, linestyle='--') +``` +ARMA model also requires the data to be stationary while our data here looks a little bit went upward as time went by. Just in case, we use the first difference: +```{python} +first_diff = lim_catfish_sales.diff()[1:] +plt.figure(figsize=(20,4)) +plt.plot(first_diff) +plt.title('First Difference of Catfish Sales', fontsize=20) +plt.ylabel('Sales', fontsize=16) +for year in range(start_date.year,end_date.year): + plt.axvline(pd.to_datetime(str(year)+'-01-01'), color='k', linestyle='--', alpha=0.2) +plt.axhline(first_diff.mean(), color='r', alpha=0.2, linestyle='--') +``` +**ACF** +```{python} +acf_vals = acf(first_diff) +plt.figure(figsize=(10,4)) +plt.bar(range(num_lags), acf_vals[:num_lags]) +plt.show() +``` + +Based on ACF, we should start with a MA(1) process + +**PACF** + +```{python} +num_lags = 15 +pacf_vals = pacf(first_diff, nlags=num_lags) +plt.figure(figsize=(10,4)) +plt.bar(range(num_lags), pacf_vals[:num_lags]) +plt.show() +``` +Based on PACF, we should start with a AR(4) process. Though lags after 11 seem have strong impacts, it may caused by seasonality, which we will discuss later and ignore for now. + +**training** +```{python} +#get training and testing sets +train_end = datetime(2003,7,1) +test_end = datetime(2004,1,1) + +train_data = first_diff[:train_end] +test_data = first_diff[train_end + timedelta(days=1):test_end] + + +# define model +model = ARMA(train_data, order=(4,1)) + +#fit the model +model_fit = model.fit() + +``` + + +```{python} + +#summary of the model +print(model_fit.summary()) +#get prediction start and end dates +pred_start_date = test_data.index[0] +pred_end_date = test_data.index[-1] + +#get the predictions and residuals +predictions = model_fit.predict(start=pred_start_date, end=pred_end_date) +residuals = test_data - predictions +``` +So the ARMA(4,1) model is: +$$ +\hat y_t = -0.87 y_{t-1} -0.42 y_{t-2} - 0.56y_{t-3} - 0.61y_{t-4} + 0.52\epsilon_{t-1} +$$ +**prediction results** +```{python} +plt.figure(figsize=(20,4)) +plt.plot(residuals) +plt.title('Residuals from AR Model', fontsize=20) +plt.ylabel('Error', fontsize=16) +plt.axhline(0, color='r', linestyle='--', alpha=0.2) +``` +```{python} +plt.figure(figsize=(20,4)) + +plt.plot(test_data) +plt.plot(predictions) + +plt.legend(('Data', 'Predictions'), fontsize=16) + +plt.title('First Difference of Catfish Sales', fontsize=20) +plt.ylabel('Sales', fontsize=16) +``` +```{python} +print('Root Mean Squared Error:', np.sqrt(np.mean(residuals**2))) +``` +## ARIMA(Autoregressive integrated moving average) Model +ARIMA models are applied in some cases where data show evidence of non-stationarity, where an initial differencing step (corresponding to the "integrated" part of the model) can be applied one or more times to eliminate the non-stationarity. + +The I (for "integrated") indicates that the data values have been replaced with the difference between their values and the previous values (and this differencing process may have been performed more than once). + +Here is an example using ARIMA model to fit wholesale price index(WPI) data, which is not stationary: + +```{python} +plt.rc("figure", figsize=(16,8)) +plt.rc("font", size=14) +``` + + +```{python} +# Dataset +wpi1 = requests.get('https://www.stata-press.com/data/r12/wpi1.dta').content +data = pd.read_stata(BytesIO(wpi1)) +data.index = data.t +# Set the frequency +data.index.freq="QS-OCT" + +# Fit the model +# Here we set integrated of order 1, so that the difference is assumed to be stationary, and fit a model with one autoregressive lag and one moving average lag, as well as an intercept term. +mod = sm.tsa.statespace.SARIMAX(data['wpi'], trend='c', order=(1,1,1)) +res = mod.fit(disp=False) +print(res.summary()) +``` + +According to the summary, we have: +$$ +\Delta y_t = 0.0943 + 0.8742 \Delta y_{t-1} - 0.4120\epsilon_{t-1} + \epsilon_t +$$ + +## SARIMA model +This model is an extension of ARIMA model. The new part of this model is that there is allowed to be a seasonal effect. The second difference is that this model uses the log of the data rather than the level. + +```{python} +# Dataset +data = pd.read_stata(BytesIO(wpi1)) +data.index = data.t +data.index.freq="QS-OCT" + +data['ln_wpi'] = np.log(data['wpi']) +data['D.ln_wpi'] = data['ln_wpi'].diff() +``` + + +```{python} +# Graph data +fig, axes = plt.subplots(1, 2, figsize=(20,4)) + +# Levels +axes[0].plot(data.index._mpl_repr(), data['wpi'], '-') +axes[0].set(title='US Wholesale Price Index') + +# Log difference +axes[1].plot(data.index._mpl_repr(), data['D.ln_wpi'], '-') +axes[1].hlines(0, data.index[0], data.index[-1], 'r') +axes[1].set(title='US Wholesale Price Index - difference of logs') +plt.show() +``` +```{python} +# Graph data +fig, axes = plt.subplots(1, 2, figsize=(20,4)) + +fig = sm.graphics.tsa.plot_acf(data.iloc[1:]['D.ln_wpi'], lags=40, ax=axes[0]) +fig = sm.graphics.tsa.plot_pacf(data.iloc[1:]['D.ln_wpi'], lags=40, ax=axes[1]) +plt.show() +``` + + +From the first two graphs, we note that the original time series does not appear to be stationary, whereas the first-difference does. This supports either estimating an ARMA model on the first-difference of the data, or estimating an ARIMA model with 1 order of integration (recall that we are taking the latter approach). The last two graphs support the use of an ARMA(1,1,1) model. + +What we want is a polynomial that has terms for the 1st and 4th degrees, but leaves out the 2nd and 3rd terms. To do that, we need to provide a tuple for the specification parameter, where the tuple describes the lag polynomial itself. In particular, here we would want to use: + +```{python} +ar = 1 # this is the maximum degree specification +ma = (1,0,0,1) # this is the lag polynomial specification + +# Fit the model +mod = sm.tsa.statespace.SARIMAX(data['ln_wpi'], trend='c', order=(ar,1,ma)) +res = mod.fit(disp=False) +print(res.summary()) +``` + +Alternatively, we can use $$ARIMA(p,d,q)\times (P,D,Q)_s$$, where the lowercase letters indicate the specification for the non-seasonal component, and the uppercase letters indicate the specification for the seasonal component; s is the periodicity of the seasons (e.g. it is often 4 for quarterly data or 12 for monthly data). + +Here we use airline data as an exmaple: +```{python} +# Dataset +air2 = requests.get('https://www.stata-press.com/data/r12/air2.dta').content +data = pd.read_stata(BytesIO(air2)) +data.index = pd.date_range(start=datetime(data.time[0], 1, 1), periods=len(data), freq='MS') +data['lnair'] = np.log(data['air']) + +# Fit the model +mod = sm.tsa.statespace.SARIMAX(data['lnair'], order=(2,1,0), seasonal_order=(1,1,0,12), simple_differencing=True) +res = mod.fit(disp=False) +print(res.summary()) + +``` +Notice that here we used an additional argument simple_differencing=True. This controls how the order of integration is handled in ARIMA models. If simple_differencing=True, then the time series provided as endog is literally differenced and an ARMA model is fit to the resulting new time series. This implies that a number of initial periods are lost to the differencing process, however it may be necessary either to compare results to other packages (e.g. Stata’s arima always uses simple differencing) or if the seasonal periodicity is large. + +The default is simple_differencing=False, in which case the integration component is implemented as part of the state space formulation, and all of the original data can be used in estimation. + +## ARIMAX + +This model demonstrates the use of explanatory variables (the X part of ARMAX). When exogenous regressors are included, the SARIMAX module uses the concept of “regression with SARIMA errors” (see http://robjhyndman.com/hyndsight/arimax/ for details of regression with ARIMA errors versus alternative specifications) + +Here is the example using ARIMAX: +```{python} +# Dataset +friedman2 = requests.get('https://www.stata-press.com/data/r12/friedman2.dta').content +data = pd.read_stata(BytesIO(friedman2)) +data.index = data.time +data.index.freq = "QS-OCT" + +# Variables +endog = data.loc['1959':'1981', 'consump'] +exog = sm.add_constant(data.loc['1959':'1981', 'm2']) + +# Fit the model +mod = sm.tsa.statespace.SARIMAX(endog, exog, order=(1,0,1)) +res = mod.fit(disp=False) +print(res.summary()) + +``` + +## ARIMA Postestimation + +Finally, here we describe some of the post-estimation capabilities of statsmodels’ SARIMAX. + +First, using the model from example, we estimate the parameters using data that excludes the last few observations (this is a little artificial as an example, but it allows considering performance of out-of-sample forecasting and facilitates comparison to Stata’s documentation). + +```{python} +# Dataset +raw = pd.read_stata(BytesIO(friedman2)) +raw.index = raw.time +raw.index.freq = "QS-OCT" +data = raw.loc[:'1981'] + +# Variables +endog = data.loc['1959':, 'consump'] +exog = sm.add_constant(data.loc['1959':, 'm2']) +nobs = endog.shape[0] + +# Fit the model +mod = sm.tsa.statespace.SARIMAX(endog.loc[:'1978-01-01'], exog=exog.loc[:'1978-01-01'], order=(1,0,1)) +fit_res = mod.fit(disp=False, maxiter=250) +print(fit_res.summary()) +``` +Next, we want to get results for the full dataset but using the estimated parameters (on a subset of the data). +```{python} +mod = sm.tsa.statespace.SARIMAX(endog, exog=exog, order=(1,0,1)) +res = mod.filter(fit_res.params) +``` +The predict command is first applied here to get in-sample predictions. We use the full_results=True argument to allow us to calculate confidence intervals (the default output of predict is just the predicted values). + +With no other arguments, predict returns the one-step-ahead in-sample predictions for the entire sample. + +```{python} +# In-sample one-step-ahead predictions +predict = res.get_prediction() +predict_ci = predict.conf_int() +``` + +We can also get dynamic predictions. One-step-ahead prediction uses the true values of the endogenous values at each step to predict the next in-sample value. Dynamic predictions use one-step-ahead prediction up to some point in the dataset (specified by the dynamic argument); after that, the previous predicted endogenous values are used in place of the true endogenous values for each new predicted element. + +The dynamic argument is specified to be an offset relative to the start argument. If start is not specified, it is assumed to be 0. + +Here we perform dynamic prediction starting in the first quarter of 1978. + +```{python} +# Dynamic predictions +predict_dy = res.get_prediction(dynamic='1978-01-01') +predict_dy_ci = predict_dy.conf_int() +``` + +We can graph the one-step-ahead and dynamic predictions (and the corresponding confidence intervals) to see their relative performance. Notice that up to the point where dynamic prediction begins (1978:Q1), the two are the same. + +```{python} +# Graph +fig, ax = plt.subplots(figsize=(20,4)) +npre = 4 +ax.set(title='Personal consumption', xlabel='Date', ylabel='Billions of dollars') + +# Plot data points +data.loc['1977-07-01':, 'consump'].plot(ax=ax, style='o', label='Observed') + +# Plot predictions +predict.predicted_mean.loc['1977-07-01':].plot(ax=ax, style='r--', label='One-step-ahead forecast') +ci = predict_ci.loc['1977-07-01':] +ax.fill_between(ci.index, ci.iloc[:,0], ci.iloc[:,1], color='r', alpha=0.1) +predict_dy.predicted_mean.loc['1977-07-01':].plot(ax=ax, style='g', label='Dynamic forecast (1978)') +ci = predict_dy_ci.loc['1977-07-01':] +ax.fill_between(ci.index, ci.iloc[:,0], ci.iloc[:,1], color='g', alpha=0.1) + +legend = ax.legend(loc='lower right') +plt.show() +``` + +Finally, graph the prediction error. It is obvious that, as one would suspect, one-step-ahead prediction is considerably better. + +```{python} +# Prediction error + +# Graph +fig, ax = plt.subplots(figsize=(20,4)) +npre = 4 +ax.set(title='Forecast error', xlabel='Date', ylabel='Forecast - Actual') + +# In-sample one-step-ahead predictions and 95% confidence intervals +predict_error = predict.predicted_mean - endog +predict_error.loc['1977-10-01':].plot(ax=ax, label='One-step-ahead forecast') +ci = predict_ci.loc['1977-10-01':].copy() +ci.iloc[:,0] -= endog.loc['1977-10-01':] +ci.iloc[:,1] -= endog.loc['1977-10-01':] +ax.fill_between(ci.index, ci.iloc[:,0], ci.iloc[:,1], alpha=0.1) + +# Dynamic predictions and 95% confidence intervals +predict_dy_error = predict_dy.predicted_mean - endog +predict_dy_error.loc['1977-10-01':].plot(ax=ax, style='r', label='Dynamic forecast (1978)') +ci = predict_dy_ci.loc['1977-10-01':].copy() +ci.iloc[:,0] -= endog.loc['1977-10-01':] +ci.iloc[:,1] -= endog.loc['1977-10-01':] +ax.fill_between(ci.index, ci.iloc[:,0], ci.iloc[:,1], color='r', alpha=0.1) + +legend = ax.legend(loc='lower left'); +legend.get_frame().set_facecolor('w') +plt.show() +``` + +**Reference** + +1. [ACF] https://medium.com/@krzysztofdrelczuk/acf-autocorrelation-function-simple-explanation-with-python-example-492484c32711 +2. [PACF] https://en.wikipedia.org/wiki/Partial_autocorrelation_function +3. [AR] https://pythondata.com/forecasting-time-series-autoregression/ +4. [MA] https://en.wikipedia.org/wiki/Moving_average +5. [MA] https://github.com/ritvikmath/Time-Series-Analysis/blob/master/MA%20Model.ipynb +6. [ARMA] https://github.com/ritvikmath/Time-Series-Analysis/blob/master/ARMA%20Model.ipynb +7. [ARMA] https://en.wikipedia.org/wiki/Autoregressive%E2%80%93moving-average_model +8. [ARIMA] https://en.wikipedia.org/wiki/Autoregressive_integrated_moving_average +9. [ARIMAX] https://www.statsmodels.org/dev/examples/notebooks/generated/statespace_sarimax_stata.html# + + From fc26f37b1cf177ff9e6b11559d5c90899faa9072 Mon Sep 17 00:00:00 2001 From: lubin-liu <69323872+lubin-liu@users.noreply.github.com> Date: Thu, 17 Dec 2020 16:28:41 -0500 Subject: [PATCH 2/2] Update _bookdown.yml --- _bookdown.yml | 86 +-------------------------------------------------- 1 file changed, 1 insertion(+), 85 deletions(-) diff --git a/_bookdown.yml b/_bookdown.yml index a87ec31..a361ddf 100644 --- a/_bookdown.yml +++ b/_bookdown.yml @@ -9,90 +9,6 @@ language: # Chapter ordering -rmd_files: [ - # Part: Introduction - 'index.Rmd', # must be first chapter - 'sample_project.Rmd', - - # Part 2: Data Processing and Wrangling - 'r_data_transformation.Rmd', # contains PART header - 'data_preprocessing_tutorial.Rmd', - 'apply_family.Rmd', - 'using_sql_in_r.Rmd', - - # Part 3: Data Visualization - 'pairs_and_ggpairs.Rmd', # contains PART header - 'a_brief_introduction_to_seaborn.Rmd', - 'geommlbstadiums.Rmd', - 'radar_chart.Rmd', - 'likert_scale.Rmd', - 'a_simple_way_to_visualize_geographic_data.Rmd', - 'interactive_choropleth_map.Rmd', - 'introduction_to_violin_plot.Rmd', # missing data file - 'base_r_ggplot_graph.Rmd', - - # Part: Results Reporting - 'visualization_tools_in_real_work.Rmd', # contains PART header - 'color_in_r.Rmd', - #'visualization_nyc_covid19_rshiny.Rmd', #shiny doesn't work - 'illustrate_graphs_in_r.Rmd', - 'shiny_tutorial.Rmd', - - # Part: Complete Analyses - 'china_choropleth_map.Rmd', # contains PART header - 'benfords_law.Rmd', - 'visualizing_electoral_margins.Rmd', - 'survial_analysis.Rmd', - - # Part: Cheatsheets and Video Tutorials - 'mosaic_plot_cheatsheet.Rmd', # contains PART header - 'add_ts_cheatsheet.Rmd', - 'python_vs_r_cheatsheet.Rmd', - 'likert_data_cheatsheet.Rmd', - 'categorical_cheat_sheet.Rmd', - 'replication_crisis_psychology.Rmd', - 'tableau_data_story_tutorial.Rmd', - - # Part: Live Tutorials - 'tableau_intro_tutorial_demo_with_pset.Rmd', # contains PART header - 'datapipeline_design.Rmd', - 'project_pichon.Rmd', - 'american_history_trivia_night.Rmd', - 'consuming_terrorism_statistics.Rmd', - 'introduction_to_network_analysis.Rmd', - - # Part: Data Visualization - 'label_setting_techniques.Rmd', - - # Part: Translations - 'chinese_translation_of_candela_package.Rmd', # contains PART header - 'greek_edav_histogram.Rmd', - 'korean_translation_of_scatterplot.Rmd', - 'rmarkdown_tutorial_chinese_translation.Rmd', - 'ggplot2_tutorial_translated.Rmd', - 'chinese_tutorial_rvest_httr.Rmd', - 'website_sharing_whole_class_mosaic_plot_translation.Rmd', - 'multivariate_grouped_data_plot.Rmd', - - # Part: Other Topics - 'rselenium_tutorial.Rmd', # contains PART header - 'ggpubr_package.Rmd', - 'geographical_map_packages.Rmd', - 'health_datasets_for_the_final_project.Rmd', - 'layout_for_baseplot_and_ggpplot.Rmd', - 'a_basic_introduction_to_mcmc_in_r.Rmd', - 'automate_eda_with_dataexplorer.Rmd', - 'speed_up_in_r_programming.Rmd', - 'data_visualization_in_python.Rmd', - 'final_project_teammate_finder.Rmd', - 'among_us_stats.Rmd', - 'ipywidgets_tutorial.Rmd', - 'linkedin_professional_development_session.Rmd', - - # Part: Appendices - 'appendix_initial_setup.Rmd', # contains PART header - 'appendix_pull_request_tutorial.Rmd', - 'appendix_chapter_organization.Rmd' -] +'sarimax_explanation.Rmd', before_chapter_script: "_common.R"