From 88ca99094ee3afccdca706406d4d82ebcfaef7aa Mon Sep 17 00:00:00 2001
From: wangyis <40844333+wangyis@users.noreply.github.com>
Date: Sun, 6 Dec 2020 20:18:24 -0500
Subject: [PATCH 1/2] submission

---
 .../sarimax_explanation/AirPassengers.csv     |   1 +
 resources/sarimax_explanation/catfish.csv     | 325 +++++++++
 .../sarimax_explanation/retail_sales.csv      |  73 ++
 sarimax_explanation.Rmd                       | 653 ++++++++++++++++++
 4 files changed, 1052 insertions(+)
 create mode 100644 resources/sarimax_explanation/AirPassengers.csv
 create mode 100644 resources/sarimax_explanation/catfish.csv
 create mode 100644 resources/sarimax_explanation/retail_sales.csv
 create mode 100644 sarimax_explanation.Rmd

diff --git a/resources/sarimax_explanation/AirPassengers.csv b/resources/sarimax_explanation/AirPassengers.csv
new file mode 100644
index 0000000..3a127e2
--- /dev/null
+++ b/resources/sarimax_explanation/AirPassengers.csv
@@ -0,0 +1 @@
+Month,#Passengers1949-01,1121949-02,1181949-03,1321949-04,1291949-05,1211949-06,1351949-07,1481949-08,1481949-09,1361949-10,1191949-11,1041949-12,1181950-01,1151950-02,1261950-03,1411950-04,1351950-05,1251950-06,1491950-07,1701950-08,1701950-09,1581950-10,1331950-11,1141950-12,1401951-01,1451951-02,1501951-03,1781951-04,1631951-05,1721951-06,1781951-07,1991951-08,1991951-09,1841951-10,1621951-11,1461951-12,1661952-01,1711952-02,1801952-03,1931952-04,1811952-05,1831952-06,2181952-07,2301952-08,2421952-09,2091952-10,1911952-11,1721952-12,1941953-01,1961953-02,1961953-03,2361953-04,2351953-05,2291953-06,2431953-07,2641953-08,2721953-09,2371953-10,2111953-11,1801953-12,2011954-01,2041954-02,1881954-03,2351954-04,2271954-05,2341954-06,2641954-07,3021954-08,2931954-09,2591954-10,2291954-11,2031954-12,2291955-01,2421955-02,2331955-03,2671955-04,2691955-05,2701955-06,3151955-07,3641955-08,3471955-09,3121955-10,2741955-11,2371955-12,2781956-01,2841956-02,2771956-03,3171956-04,3131956-05,3181956-06,3741956-07,4131956-08,4051956-09,3551956-10,3061956-11,2711956-12,3061957-01,3151957-02,3011957-03,3561957-04,3481957-05,3551957-06,4221957-07,4651957-08,4671957-09,4041957-10,3471957-11,3051957-12,3361958-01,3401958-02,3181958-03,3621958-04,3481958-05,3631958-06,4351958-07,4911958-08,5051958-09,4041958-10,3591958-11,3101958-12,3371959-01,3601959-02,3421959-03,4061959-04,3961959-05,4201959-06,4721959-07,5481959-08,5591959-09,4631959-10,4071959-11,3621959-12,4051960-01,4171960-02,3911960-03,4191960-04,4611960-05,4721960-06,5351960-07,6221960-08,6061960-09,5081960-10,4611960-11,3901960-12,432
\ No newline at end of file
diff --git a/resources/sarimax_explanation/catfish.csv b/resources/sarimax_explanation/catfish.csv
new file mode 100644
index 0000000..1e21eef
--- /dev/null
+++ b/resources/sarimax_explanation/catfish.csv
@@ -0,0 +1,325 @@
+Date,Total
+1986-1-01,9034
+1986-2-01,9596
+1986-3-01,10558
+1986-4-01,9002
+1986-5-01,9239
+1986-6-01,8951
+1986-7-01,9668
+1986-8-01,10188
+1986-9-01,9896
+1986-10-01,10649
+1986-11-01,8917
+1986-12-01,8196
+1987-1-01,10768
+1987-2-01,12220
+1987-3-01,14463
+1987-4-01,12944
+1987-5-01,11001
+1987-6-01,11000
+1987-7-01,11876
+1987-8-01,13021
+1987-9-01,13494
+1987-10-01,14041
+1987-11-01,11312
+1987-12-01,10362
+1988-1-01,12533
+1988-2-01,14343
+1988-3-01,14676
+1988-4-01,12253
+1988-5-01,11840
+1988-6-01,12018
+1988-7-01,11819
+1988-8-01,12453
+1988-9-01,12309
+1988-10-01,13005
+1988-11-01,11815
+1988-12-01,10496
+1989-1-01,13996
+1989-2-01,15158
+1989-3-01,15587
+1989-4-01,14615
+1989-5-01,14792
+1989-6-01,14373
+1989-7-01,14573
+1989-8-01,14385
+1989-9-01,15607
+1989-10-01,15978
+1989-11-01,14436
+1989-12-01,12794
+1990-1-01,15771
+1990-2-01,16476
+1990-3-01,16420
+1990-4-01,15444
+1990-5-01,16118
+1990-6-01,15158
+1990-7-01,15214
+1990-8-01,16257
+1990-9-01,15287
+1990-10-01,15212
+1990-11-01,13488
+1990-12-01,12300
+1991-1-01,15794
+1991-2-01,17113
+1991-3-01,17536
+1991-4-01,15914
+1991-5-01,16716
+1991-6-01,15983
+1991-7-01,16878
+1991-8-01,17893
+1991-9-01,16697
+1991-10-01,18546
+1991-11-01,15486
+1991-12-01,15253
+1992-1-01,18698
+1992-2-01,21100
+1992-3-01,22425
+1992-4-01,19923
+1992-5-01,19454
+1992-6-01,18874
+1992-7-01,19676
+1992-8-01,19559
+1992-9-01,19500
+1992-10-01,19615
+1992-11-01,16814
+1992-12-01,15698
+1993-1-01,20273
+1993-2-01,20774
+1993-3-01,21309
+1993-4-01,19377
+1993-5-01,19211
+1993-6-01,19103
+1993-7-01,20086
+1993-8-01,19758
+1993-9-01,19162
+1993-10-01,19371
+1993-11-01,17709
+1993-12-01,17342
+1994-1-01,18772
+1994-2-01,19363
+1994-3-01,20001
+1994-4-01,17112
+1994-5-01,18027
+1994-6-01,17173
+1994-7-01,18024
+1994-8-01,18983
+1994-9-01,17983
+1994-10-01,19151
+1994-11-01,16427
+1994-12-01,15461
+1995-1-01,19191
+1995-2-01,20008
+1995-3-01,21702
+1995-4-01,18649
+1995-5-01,19169
+1995-6-01,18631
+1995-7-01,18157
+1995-8-01,20187
+1995-9-01,18660
+1995-10-01,19920
+1995-11-01,16680
+1995-12-01,16018
+1996-1-01,20322
+1996-2-01,20613
+1996-3-01,22704
+1996-4-01,20276
+1996-5-01,20669
+1996-6-01,18074
+1996-7-01,18719
+1996-8-01,20217
+1996-9-01,19642
+1996-10-01,20842
+1996-11-01,18204
+1996-12-01,16898
+1997-1-01,20746
+1997-2-01,23058
+1997-3-01,24624
+1997-4-01,22154
+1997-5-01,22444
+1997-6-01,21471
+1997-7-01,21866
+1997-8-01,22548
+1997-9-01,21518
+1997-10-01,23408
+1997-11-01,19645
+1997-12-01,18278
+1998-1-01,23576
+1998-2-01,26650
+1998-3-01,26207
+1998-4-01,23195
+1998-5-01,22960
+1998-6-01,23002
+1998-7-01,22973
+1998-8-01,24089
+1998-9-01,22805
+1998-10-01,23241
+1998-11-01,21581
+1998-12-01,21119
+1999-1-01,23107
+1999-2-01,25780
+1999-3-01,28544
+1999-4-01,23488
+1999-5-01,23964
+1999-6-01,23720
+1999-7-01,25069
+1999-8-01,24618
+1999-9-01,24430
+1999-10-01,25229
+1999-11-01,22344
+1999-12-01,22372
+2000-1-01,25412
+2000-2-01,25354
+2000-3-01,29161
+2000-4-01,24924
+2000-5-01,24763
+2000-6-01,25342
+2000-7-01,24911
+2000-8-01,25847
+2000-9-01,23743
+2000-10-01,25036
+2000-11-01,21911
+2000-12-01,20752
+2001-1-01,24507
+2001-2-01,25968
+2001-3-01,28752
+2001-4-01,25167
+2001-5-01,24728
+2001-6-01,23690
+2001-7-01,24816
+2001-8-01,26004
+2001-9-01,24210
+2001-10-01,25083
+2001-11-01,21807
+2001-12-01,21635
+2002-1-01,27173
+2002-2-01,29308
+2002-3-01,28645
+2002-4-01,25023
+2002-5-01,27261
+2002-6-01,24670
+2002-7-01,26441
+2002-8-01,27961
+2002-9-01,26498
+2002-10-01,27800
+2002-11-01,23939
+2002-12-01,22930
+2003-1-01,27584
+2003-2-01,27586
+2003-3-01,30485
+2003-4-01,26135
+2003-5-01,27370
+2003-6-01,25487
+2003-7-01,26427
+2003-8-01,27672
+2003-9-01,26853
+2003-10-01,27875
+2003-11-01,23416
+2003-12-01,22482
+2004-1-01,27140
+2004-2-01,28526
+2004-3-01,28845
+2004-4-01,25033
+2004-5-01,24764
+2004-6-01,24896
+2004-7-01,24623
+2004-8-01,26538
+2004-9-01,24674
+2004-10-01,25863
+2004-11-01,23156
+2004-12-01,22721
+2005-1-01,26204
+2005-2-01,26526
+2005-3-01,27473
+2005-4-01,24536
+2005-5-01,25764
+2005-6-01,25154
+2005-7-01,23729
+2005-8-01,25336
+2005-9-01,24649
+2005-10-01,25904
+2005-11-01,22868
+2005-12-01,21825
+2006-1-01,26955
+2006-2-01,27349
+2006-3-01,29367
+2006-4-01,24071
+2006-5-01,23173
+2006-6-01,21740
+2006-7-01,22056
+2006-8-01,23923
+2006-9-01,22189
+2006-10-01,22458
+2006-11-01,19476
+2006-12-01,21056
+2007-1-01,22842
+2007-2-01,22943
+2007-3-01,23445
+2007-4-01,19176
+2007-5-01,22058
+2007-6-01,19451
+2007-7-01,20378
+2007-8-01,21778
+2007-9-01,20431
+2007-10-01,22486
+2007-11-01,18889
+2007-12-01,18574
+2008-1-01,24658
+2008-2-01,24997
+2008-3-01,23987
+2008-4-01,21199
+2008-5-01,21205
+2008-6-01,21553
+2008-7-01,21166
+2008-8-01,20720
+2008-9-01,18067
+2008-10-01,21121
+2008-11-01,16617
+2008-12-01,15917
+2009-1-01,19262
+2009-2-01,20658
+2009-3-01,22660
+2009-4-01,20147
+2009-5-01,18818
+2009-6-01,19017
+2009-7-01,19451
+2009-8-01,18502
+2009-9-01,18893
+2009-10-01,19480
+2009-11-01,15743
+2009-12-01,16554
+2010-1-01,19465
+2010-2-01,22617
+2010-3-01,22438
+2010-4-01,18511
+2010-5-01,19142
+2010-6-01,19063
+2010-7-01,18006
+2010-8-01,20386
+2010-9-01,20280
+2010-10-01,18894
+2010-11-01,15941
+2010-12-01,16851
+2011-1-01,18204
+2011-2-01,15564
+2011-3-01,17984
+2011-4-01,13979
+2011-5-01,12591
+2011-6-01,12408
+2011-7-01,12779
+2011-8-01,14233
+2011-9-01,13410
+2011-10-01,12893
+2011-11-01,11843
+2011-12-01,11321
+2012-1-01,13427
+2012-2-01,14447
+2012-3-01,14717
+2012-4-01,11998
+2012-5-01,13379
+2012-6-01,12463
+2012-7-01,13276
+2012-8-01,14442
+2012-9-01,13422
+2012-10-01,13795
+2012-11-01,13352
+2012-12-01,12716
diff --git a/resources/sarimax_explanation/retail_sales.csv b/resources/sarimax_explanation/retail_sales.csv
new file mode 100644
index 0000000..0b5499c
--- /dev/null
+++ b/resources/sarimax_explanation/retail_sales.csv
@@ -0,0 +1,73 @@
+date,sales
+2009-10-01,338630
+2009-11-01,339386
+2009-12-01,400264
+2010-01-01,314640
+2010-02-01,311022
+2010-03-01,360819
+2010-04-01,356460
+2010-05-01,365713
+2010-06-01,358675
+2010-07-01,362027
+2010-08-01,362682
+2010-09-01,346069
+2010-10-01,355212
+2010-11-01,365809
+2010-12-01,426654
+2011-01-01,335608
+2011-02-01,337352
+2011-03-01,387092
+2011-04-01,380754
+2011-05-01,391970
+2011-06-01,388636
+2011-07-01,384600
+2011-08-01,394548
+2011-09-01,374895
+2011-10-01,379364
+2011-11-01,391081
+2011-12-01,451669
+2012-01-01,355058
+2012-02-01,372523
+2012-03-01,414275
+2012-04-01,393035
+2012-05-01,418648
+2012-06-01,400996
+2012-07-01,396020
+2012-08-01,417911
+2012-09-01,385597
+2012-10-01,399341
+2012-11-01,410992
+2012-12-01,461994
+2013-01-01,375537
+2013-02-01,373938
+2013-03-01,421638
+2013-04-01,408381
+2013-05-01,436985
+2013-06-01,414701
+2013-07-01,422357
+2013-08-01,434950
+2013-09-01,396199
+2013-10-01,415740
+2013-11-01,423611
+2013-12-01,477205
+2014-01-01,383399
+2014-02-01,380315
+2014-03-01,432806
+2014-04-01,431415
+2014-05-01,458822
+2014-06-01,433152
+2014-07-01,443005
+2014-08-01,450913
+2014-09-01,420871
+2014-10-01,437702
+2014-11-01,437910
+2014-12-01,501232
+2015-01-01,397252
+2015-02-01,386935
+2015-03-01,444110
+2015-04-01,438217
+2015-05-01,462615
+2015-06-01,448229
+2015-07-01,457710
+2015-08-01,456340
+2015-09-01,430917
diff --git a/sarimax_explanation.Rmd b/sarimax_explanation.Rmd
new file mode 100644
index 0000000..25f2f51
--- /dev/null
+++ b/sarimax_explanation.Rmd
@@ -0,0 +1,653 @@
+# Sarimax explanation
+
+Yishi Wang
+
+
+```{r, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE)
+```
+## Setup
+```{python}
+from pandas import read_csv
+from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
+from statsmodels.tsa.stattools import acf, pacf 
+from statsmodels.tsa.ar_model import AR
+from statsmodels.tsa.arima_model import ARIMA 
+from datetime import datetime, timedelta
+import matplotlib
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import warnings
+from sklearn.metrics import r2_score
+from statsmodels.tsa.arima_model import ARMA
+from scipy.stats import norm
+import statsmodels.api as sm
+import requests
+from io import BytesIO
+```
+```{python}
+matplotlib.rc('xtick', labelsize=20) 
+matplotlib.rc('ytick', labelsize=20) 
+plt.rcParams.update({'font.size': 22})
+warnings.filterwarnings('ignore')
+pd.plotting.register_matplotlib_converters()
+```
+## ACF (Autocorrelation fuction)
+
+Time-series data by definition are ordered by time. Therefore, there is extra information in it people could take advantage of. We use ACF to decide which past events have heavier impect on current data. 
+
+Let's look on raw data of air passengers per month:
+```{python}
+data = pd.read_csv('resources/sarimax_explanation/AirPassengers.csv').drop(['Month'],axis=1)#.head(100)
+data_a = data.to_numpy().T[0]
+plt.figure(figsize=(20,10))
+plt.xlabel("Months")
+plt.ylabel("Number of air passengers")
+plt.grid()
+plt.plot(data_a)
+```
+and its ACF plot:
+```{python}
+plt.rc("figure", figsize=(20,10))
+plt.figure(figsize=(20,10))
+plot_acf(data_a, lags=50)
+plt.xlabel("Months")
+plt.grid()
+plt.show()
+```
+What we can see here are bars and a horizontal cone. This cone pictures the confidence level (by default set to 95%). In other words, if the point is outside the cone (on white) we may say that with 95% probability it has a certain impact on values. If the bar is inside the cone (on blue) we may ignore this particular lag as most likely it is not relevant.
+
+the bars, which are vital here, show how much past data point impacts the following ones:
+
+1. The first point (with index 0) has a height 1. In fact the first one is always 1. It make sense as current value always explain fully current value.
+2. The second point is around 0.9 which means that following point (directly next one) is described in 90% by the previous value.
+3. Eleventh bar has a height of 0.75. This means that current data will impact data in 11 month by 75% (data resolution is monthly).
+
+## PACF(Partial autocorrelation function)
+In time series analysis, the partial autocorrelation function (PACF) gives the partial correlation of a stationary time series with its own lagged values, regressed the values of the time series at all shorter lags. It contrasts with the autocorrelation function, which does not control for other lags.
+
+This function plays an important role in data analysis aimed at identifying the extent of the lag in an autoregressive model. The use of this function was introduced as part of the Box–Jenkins approach to time series modelling, whereby plotting the partial autocorrelative functions one could determine the appropriate lags p in an AR (p) model or in an extended ARIMA (p,d,q) model.
+
+## AR(Autoregression) Model
+Autoregression modeling is a modeling technique used for time series data that assumes linear continuation of the series so that previous values in the time series can be used to predict futures values. Different from linear regression model, With the autoregression model, we use previous data points to predict future data point(s) but with multiple lag variables.
+
+An exmaple of AR model:
+$$
+y = a + b_1X_{t-1}+b_2X_{t-2}+b_3X_{t-3}
+$$
+where $a$, $b_1$, $b_2$ and $b_3$ are variables found during the training of the model and $X_{t-1}$, $X_{t-2}$ and $X_{t-3}$ are input variables at previous times within the data set and t is the lag value.
+
+To AR modeling, there are two assumptiopns:
+
+1. The previous time step(s) is useful in predicting the value at the next time step (dependance between values)
+2. The data is stationary. A time series is stationary if is mean (and/or variance) is constant over time. There are other statistical properties to look at as well, but looking at the mean is usually the fastest/easiest.
+
+Here is an exmaple, the exmaple uses retial sales data. 
+```{python}
+sales_data = pd.read_csv('resources/sarimax_explanation/retail_sales.csv')
+sales_data['date']=pd.to_datetime(sales_data['date'])
+sales_data.set_index('date', inplace=True)
+
+plt.figure(figsize=(20,10))
+plt.xlabel("Year")
+plt.ylabel("Sales")
+plt.grid()
+plt.title('Sales VS years')
+plt.plot(sales_data)
+```
+plot its ACF:
+ 
+```{python}
+
+plt.rc("figure", figsize=(20,10))
+pd.plotting.autocorrelation_plot(sales_data['sales'])
+plt.title("ACF")
+plt.show()
+```
+
+From the ACF plot above, we can see there’s some significant correlation between t=1 and t=12 (roughly) with significant decline in correlation after that timeframe.  Since we are looking at monthly sales data, this seems to make sense with correlations falling off at the start of the new fiscal year.
+
+From looking at above charts, we find that it is a non-stationary dataset due to the increasing trend from lower left to upper right.For simplicity, we don't test its trend in this document and use quick/dirty method of differencing to get a more stationary model:
+
+```{python}
+sales_data['stationary']=sales_data['sales'].diff()
+plt.figure(figsize=(20,10))
+plt.xlabel("Year")
+plt.ylabel("Sales difference")
+plt.grid()
+plt.plot(sales_data['stationary'])
+```
+ 
+
+ Since the data becomes stationary, we then apply AR modeling in **statsmodels.libtrary**.
+
+```{python}
+#create train/test datasets
+X = sales_data['stationary'].dropna()
+train_data = X[1:len(X)-12]
+test_data = X[len(X)-12:]
+
+#train the autoregression model
+model = AR(train_data)
+model_fitted = model.fit()
+```
+
+In the above, we are simply creating a testing and training dataset and then creating and fitting our AR() model. Once we've fit the model, we can look at the chosen lag and parameters of the model using some simple print statements:
+```{python}
+print('The lag value chose is: %s' % model_fitted.k_ar)
+```
+
+```{python}
+print('The coefficients of the model are:\n %s' % model_fitted.params)
+```
+If we look back at our autocorrelation plot, we can see that the lag value of 10 is where the line first touches the 95% confidence level, which is usually the way we’d select the lag value when we first run autoregression models if we were selecting things manually, so the selection makes sense.
+
+Then we can make some forecasts:
+```{python}
+plt.rc("figure", figsize=(20,10))
+# make predictions 
+predictions = model_fitted.predict(
+    start=len(train_data), 
+    end=len(train_data) + len(test_data)-1, 
+    dynamic=False)
+
+# create a comparison dataframe
+compare_df = pd.concat(
+    [sales_data['stationary'].tail(12),
+    predictions], axis=1).rename(
+    columns={'stationary': 'actual', 0:'predicted'})
+
+# plot the two values
+plt.grid()
+compare_df.plot()
+
+
+```
+The mean square error of the prediction is:
+```{python}
+r2_score(sales_data['stationary'].tail(12), predictions)
+```
+This gives us a root mean square value of 0.64, which isn’t terrible but there is room for improvement here.
+
+
+## MA(Moving Average)
+
+In statistics, a moving average (rolling average or running average) is a calculation to analyze data points by creating a series of averages of different subsets of the full data set. It is also called a moving mean (MM) or rolling mean and is a type of finite impulse response filter. Variations include: simple, and cumulative, or weighted forms.
+
+Given a series of numbers and a fixed subset size, the first element of the moving average is obtained by taking the average of the initial fixed subset of the number series. Then the subset is modified by "shifting forward"; that is, excluding the first number of the series and including the next value in the subset.
+
+A moving average is commonly used with time series data to smooth out short-term fluctuations and highlight longer-term trends or cycles.
+
+**Here is a example for applying MA:**
+
+Firstly, we generate some data using the equations below:
+$$
+y_t = 50 + 0.4 \epsilon_{t-1} + 0.3\epsilon_{t-2} + \epsilon_{t}
+$$
+$$
+\epsilon_t \sim N(0,1)
+$$
+
+```{python}
+errors = np.random.normal(0, 1, 400)
+```
+
+
+```{python}
+date_index = pd.date_range(start='9/1/2019', end='1/1/2020')
+```
+
+```{python}
+mu = 50
+series = []
+for t in range(1,len(date_index)+1):
+    series.append(mu + 0.4*errors[t-1] + 0.3*errors[t-2] + errors[t])
+```
+
+
+```{python}
+series = pd.Series(series, date_index)
+series = series.asfreq(pd.infer_freq(series.index))
+```
+
+
+```{python}
+plt.figure(figsize=(20,4))
+plt.plot(series)
+plt.axhline(mu, linestyle='--', color='grey')
+```
+```{python}
+def calc_corr(series, lag):
+    return pearsonr(series[:-lag], series[lag:])[0]
+```
+Draw its ACF graph:
+```{python}
+acf_vals = acf(series)
+num_lags = 10
+plt.figure(figsize=(10,4))
+plt.bar(range(num_lags), acf_vals[:num_lags])
+plt.show()
+```
+Its PACF graph:
+```{python}
+pacf_vals = pacf(series)
+num_lags = 25
+plt.figure(figsize=(10,4))
+plt.bar(range(num_lags), pacf_vals[:num_lags])
+plt.show()
+```
+Fit MA Model
+```{python}
+#get training and testing sets
+train_end = datetime(2019,12,30)
+test_end = datetime(2020,1,1)
+
+train_data = series[:train_end]
+test_data = series[train_end + timedelta(days=1):test_end]
+
+#Fit MA model
+model = ARIMA(train_data, order=(0,0,2))
+model_fit = model.fit()
+print(model_fit.summary())
+```
+
+According to the summary, we've got the predicted model:
+$$
+\hat y_t = 50 +0.37\epsilon_{t-1} + 0.25_
+{t-2}
+$$
+Use mode lto predict the data and calculate its error:
+```{python}
+#get prediction start and end dates
+pred_start_date = test_data.index[0]
+pred_end_date = test_data.index[-1]
+
+#get the predictions and residuals
+predictions = model_fit.predict(start=pred_start_date, end=pred_end_date)
+
+residuals = test_data - predictions
+
+#plot the prediction
+plt.figure(figsize=(20,4))
+
+plt.plot(series[-14:])
+plt.plot(predictions)
+
+plt.legend(('Data', 'Predictions'), fontsize=16)
+```
+
+```{python}
+print('Mean Absolute Percent Error:', round(np.mean(abs(residuals/test_data)),4))
+```
+
+
+```{python}
+print('Root Mean Squared Error:', np.sqrt(np.mean(residuals**2)))
+```
+## ARMA(Autoregressive–moving-average) model
+
+In the statistical analysis of time series, autoregressive–moving-average (ARMA) models provide a parsimonious description of a (weakly) stationary stochastic process in terms of two polynomials, one for the autoregression (AR) and the second for the moving average (MA). The general ARMA model was described in the 1951 thesis of Peter Whittle, Hypothesis testing in time series analysis, and it was popularized in the 1970 book by George E. P. Box and Gwilym Jenkins.
+
+Given a time series of data Xt , the ARMA model is a tool for understanding and, perhaps, predicting future values in this series. The AR part involves regressing the variable on its own lagged (i.e., past) values. The MA part involves modeling the error term as a linear combination of error terms occurring contemporaneously and at various times in the past. The model is usually referred to as the ARMA(p,q) model where p is the order of the AR part and q is the order of the MA part.
+
+Here is an exmaple using ARMA model to predict the catfish sales data following the similar steps of MA prediction above:
+```{python}
+
+def parser(s):
+  return datetime.strptime(s,'%Y-%m-%d')
+```
+```{python}
+#read data
+catfish_sales = pd.read_csv('resources/sarimax_explanation/catfish.csv', parse_dates=[0], index_col=0, squeeze=True, date_parser=parser)
+
+#infer the frequency of the data
+catfish_sales = catfish_sales.asfreq(pd.infer_freq(catfish_sales.index))
+
+start_date = datetime(2000,1,1)
+end_date = datetime(2004,1,1)
+lim_catfish_sales = catfish_sales[start_date:end_date]
+
+#plot the original data
+plt.figure(figsize=(20,4))
+plt.plot(lim_catfish_sales)
+plt.title('Catfish Sales in 1000s of Pounds', fontsize=20)
+plt.ylabel('Sales', fontsize=16)
+for year in range(start_date.year,end_date.year):
+    plt.axvline(pd.to_datetime(str(year)+'-01-01'), color='k', linestyle='--', alpha=0.2)
+plt.axhline(lim_catfish_sales.mean(), color='r', alpha=0.2, linestyle='--')
+```
+ARMA model also requires the data to be stationary while our data here looks a little bit went upward as time went by. Just in case, we use the first difference:
+```{python}
+first_diff = lim_catfish_sales.diff()[1:]
+plt.figure(figsize=(20,4))
+plt.plot(first_diff)
+plt.title('First Difference of Catfish Sales', fontsize=20)
+plt.ylabel('Sales', fontsize=16)
+for year in range(start_date.year,end_date.year):
+    plt.axvline(pd.to_datetime(str(year)+'-01-01'), color='k', linestyle='--', alpha=0.2)
+plt.axhline(first_diff.mean(), color='r', alpha=0.2, linestyle='--')
+```
+**ACF**
+```{python}
+acf_vals = acf(first_diff)
+plt.figure(figsize=(10,4))
+plt.bar(range(num_lags), acf_vals[:num_lags])
+plt.show()
+```
+
+Based on ACF, we should start with a MA(1) process
+
+**PACF**
+
+```{python}
+num_lags = 15
+pacf_vals = pacf(first_diff, nlags=num_lags)
+plt.figure(figsize=(10,4))
+plt.bar(range(num_lags), pacf_vals[:num_lags])
+plt.show()
+```
+Based on PACF, we should start with a AR(4) process. Though lags after 11 seem have strong impacts, it may caused by seasonality, which we will discuss later and ignore for now.
+
+**training**
+```{python}
+#get training and testing sets
+train_end = datetime(2003,7,1)
+test_end = datetime(2004,1,1)
+
+train_data = first_diff[:train_end]
+test_data = first_diff[train_end + timedelta(days=1):test_end]
+
+
+# define model
+model = ARMA(train_data, order=(4,1))
+
+#fit the model
+model_fit = model.fit()
+
+```
+
+
+```{python}
+
+#summary of the model
+print(model_fit.summary())
+#get prediction start and end dates
+pred_start_date = test_data.index[0]
+pred_end_date = test_data.index[-1]
+
+#get the predictions and residuals
+predictions = model_fit.predict(start=pred_start_date, end=pred_end_date)
+residuals = test_data - predictions
+```
+So the ARMA(4,1) model is:
+$$
+\hat y_t = -0.87 y_{t-1} -0.42 y_{t-2} - 0.56y_{t-3} - 0.61y_{t-4} + 0.52\epsilon_{t-1}
+$$
+**prediction results**
+```{python}
+plt.figure(figsize=(20,4))
+plt.plot(residuals)
+plt.title('Residuals from AR Model', fontsize=20)
+plt.ylabel('Error', fontsize=16)
+plt.axhline(0, color='r', linestyle='--', alpha=0.2)
+```
+```{python}
+plt.figure(figsize=(20,4))
+
+plt.plot(test_data)
+plt.plot(predictions)
+
+plt.legend(('Data', 'Predictions'), fontsize=16)
+
+plt.title('First Difference of Catfish Sales', fontsize=20)
+plt.ylabel('Sales', fontsize=16)
+```
+```{python}
+print('Root Mean Squared Error:', np.sqrt(np.mean(residuals**2)))
+```
+## ARIMA(Autoregressive integrated moving average) Model
+ARIMA models are applied in some cases where data show evidence of non-stationarity, where an initial differencing step (corresponding to the "integrated" part of the model) can be applied one or more times to eliminate the non-stationarity.
+
+The I (for "integrated") indicates that the data values have been replaced with the difference between their values and the previous values (and this differencing process may have been performed more than once). 
+
+Here is an example using ARIMA model to fit wholesale price index(WPI) data, which is not stationary:
+
+```{python}
+plt.rc("figure", figsize=(16,8))
+plt.rc("font", size=14)
+```
+
+
+```{python}
+# Dataset
+wpi1 = requests.get('https://www.stata-press.com/data/r12/wpi1.dta').content
+data = pd.read_stata(BytesIO(wpi1))
+data.index = data.t
+# Set the frequency
+data.index.freq="QS-OCT"
+
+# Fit the model
+# Here we set integrated of order 1, so that the difference is assumed to be stationary, and fit a model with one autoregressive lag and one moving average lag, as well as an intercept term.
+mod = sm.tsa.statespace.SARIMAX(data['wpi'], trend='c', order=(1,1,1))
+res = mod.fit(disp=False)
+print(res.summary())
+```
+
+According to the summary, we have:
+$$
+\Delta y_t = 0.0943 + 0.8742 \Delta y_{t-1} - 0.4120\epsilon_{t-1} + \epsilon_t
+$$
+
+## SARIMA model
+This model is an extension of ARIMA model. The new part of this model is that there is allowed to be a seasonal effect. The second difference is that this model uses the log of the data rather than the level.
+
+```{python}
+# Dataset
+data = pd.read_stata(BytesIO(wpi1))
+data.index = data.t
+data.index.freq="QS-OCT"
+
+data['ln_wpi'] = np.log(data['wpi'])
+data['D.ln_wpi'] = data['ln_wpi'].diff()
+```
+
+
+```{python}
+# Graph data
+fig, axes = plt.subplots(1, 2, figsize=(20,4))
+
+# Levels
+axes[0].plot(data.index._mpl_repr(), data['wpi'], '-')
+axes[0].set(title='US Wholesale Price Index')
+
+# Log difference
+axes[1].plot(data.index._mpl_repr(), data['D.ln_wpi'], '-')
+axes[1].hlines(0, data.index[0], data.index[-1], 'r')
+axes[1].set(title='US Wholesale Price Index - difference of logs')
+plt.show()
+```
+```{python}
+# Graph data
+fig, axes = plt.subplots(1, 2, figsize=(20,4))
+
+fig = sm.graphics.tsa.plot_acf(data.iloc[1:]['D.ln_wpi'], lags=40, ax=axes[0])
+fig = sm.graphics.tsa.plot_pacf(data.iloc[1:]['D.ln_wpi'], lags=40, ax=axes[1])
+plt.show()
+```
+
+
+From the first two graphs, we note that the original time series does not appear to be stationary, whereas the first-difference does. This supports either estimating an ARMA model on the first-difference of the data, or estimating an ARIMA model with 1 order of integration (recall that we are taking the latter approach). The last two graphs support the use of an ARMA(1,1,1) model.
+
+What we want is a polynomial that has terms for the 1st and 4th degrees, but leaves out the 2nd and 3rd terms. To do that, we need to provide a tuple for the specification parameter, where the tuple describes the lag polynomial itself. In particular, here we would want to use:
+
+```{python}
+ar = 1          # this is the maximum degree specification
+ma = (1,0,0,1)  # this is the lag polynomial specification
+
+# Fit the model
+mod = sm.tsa.statespace.SARIMAX(data['ln_wpi'], trend='c', order=(ar,1,ma))
+res = mod.fit(disp=False)
+print(res.summary())
+```
+
+Alternatively, we can use $$ARIMA(p,d,q)\times (P,D,Q)_s$$, where the lowercase letters indicate the specification for the non-seasonal component, and the uppercase letters indicate the specification for the seasonal component; s is the periodicity of the seasons (e.g. it is often 4 for quarterly data or 12 for monthly data).
+
+Here we use airline data as an exmaple:
+```{python}
+# Dataset
+air2 = requests.get('https://www.stata-press.com/data/r12/air2.dta').content
+data = pd.read_stata(BytesIO(air2))
+data.index = pd.date_range(start=datetime(data.time[0], 1, 1), periods=len(data), freq='MS')
+data['lnair'] = np.log(data['air'])
+
+# Fit the model
+mod = sm.tsa.statespace.SARIMAX(data['lnair'], order=(2,1,0), seasonal_order=(1,1,0,12), simple_differencing=True)
+res = mod.fit(disp=False)
+print(res.summary())
+
+```
+Notice that here we used an additional argument simple_differencing=True. This controls how the order of integration is handled in ARIMA models. If simple_differencing=True, then the time series provided as endog is literally differenced and an ARMA model is fit to the resulting new time series. This implies that a number of initial periods are lost to the differencing process, however it may be necessary either to compare results to other packages (e.g. Stata’s arima always uses simple differencing) or if the seasonal periodicity is large.
+
+The default is simple_differencing=False, in which case the integration component is implemented as part of the state space formulation, and all of the original data can be used in estimation.
+
+## ARIMAX 
+
+This model demonstrates the use of explanatory variables (the X part of ARMAX). When exogenous regressors are included, the SARIMAX module uses the concept of “regression with SARIMA errors” (see http://robjhyndman.com/hyndsight/arimax/ for details of regression with ARIMA errors versus alternative specifications)
+
+Here is the example using ARIMAX:
+```{python}
+# Dataset
+friedman2 = requests.get('https://www.stata-press.com/data/r12/friedman2.dta').content
+data = pd.read_stata(BytesIO(friedman2))
+data.index = data.time
+data.index.freq = "QS-OCT"
+
+# Variables
+endog = data.loc['1959':'1981', 'consump']
+exog = sm.add_constant(data.loc['1959':'1981', 'm2'])
+
+# Fit the model
+mod = sm.tsa.statespace.SARIMAX(endog, exog, order=(1,0,1))
+res = mod.fit(disp=False)
+print(res.summary())
+
+```
+
+## ARIMA Postestimation
+
+Finally, here we describe some of the post-estimation capabilities of statsmodels’ SARIMAX.
+
+First, using the model from example, we estimate the parameters using data that excludes the last few observations (this is a little artificial as an example, but it allows considering performance of out-of-sample forecasting and facilitates comparison to Stata’s documentation).
+
+```{python}
+# Dataset
+raw = pd.read_stata(BytesIO(friedman2))
+raw.index = raw.time
+raw.index.freq = "QS-OCT"
+data = raw.loc[:'1981']
+
+# Variables
+endog = data.loc['1959':, 'consump']
+exog = sm.add_constant(data.loc['1959':, 'm2'])
+nobs = endog.shape[0]
+
+# Fit the model
+mod = sm.tsa.statespace.SARIMAX(endog.loc[:'1978-01-01'], exog=exog.loc[:'1978-01-01'], order=(1,0,1))
+fit_res = mod.fit(disp=False, maxiter=250)
+print(fit_res.summary())
+```
+Next, we want to get results for the full dataset but using the estimated parameters (on a subset of the data).
+```{python}
+mod = sm.tsa.statespace.SARIMAX(endog, exog=exog, order=(1,0,1))
+res = mod.filter(fit_res.params)
+```
+The predict command is first applied here to get in-sample predictions. We use the full_results=True argument to allow us to calculate confidence intervals (the default output of predict is just the predicted values).
+
+With no other arguments, predict returns the one-step-ahead in-sample predictions for the entire sample.
+
+```{python}
+# In-sample one-step-ahead predictions
+predict = res.get_prediction()
+predict_ci = predict.conf_int()
+```
+
+We can also get dynamic predictions. One-step-ahead prediction uses the true values of the endogenous values at each step to predict the next in-sample value. Dynamic predictions use one-step-ahead prediction up to some point in the dataset (specified by the dynamic argument); after that, the previous predicted endogenous values are used in place of the true endogenous values for each new predicted element.
+
+The dynamic argument is specified to be an offset relative to the start argument. If start is not specified, it is assumed to be 0.
+
+Here we perform dynamic prediction starting in the first quarter of 1978.
+
+```{python}
+# Dynamic predictions
+predict_dy = res.get_prediction(dynamic='1978-01-01')
+predict_dy_ci = predict_dy.conf_int()
+```
+
+We can graph the one-step-ahead and dynamic predictions (and the corresponding confidence intervals) to see their relative performance. Notice that up to the point where dynamic prediction begins (1978:Q1), the two are the same.
+
+```{python}
+# Graph
+fig, ax = plt.subplots(figsize=(20,4))
+npre = 4
+ax.set(title='Personal consumption', xlabel='Date', ylabel='Billions of dollars')
+
+# Plot data points
+data.loc['1977-07-01':, 'consump'].plot(ax=ax, style='o', label='Observed')
+
+# Plot predictions
+predict.predicted_mean.loc['1977-07-01':].plot(ax=ax, style='r--', label='One-step-ahead forecast')
+ci = predict_ci.loc['1977-07-01':]
+ax.fill_between(ci.index, ci.iloc[:,0], ci.iloc[:,1], color='r', alpha=0.1)
+predict_dy.predicted_mean.loc['1977-07-01':].plot(ax=ax, style='g', label='Dynamic forecast (1978)')
+ci = predict_dy_ci.loc['1977-07-01':]
+ax.fill_between(ci.index, ci.iloc[:,0], ci.iloc[:,1], color='g', alpha=0.1)
+
+legend = ax.legend(loc='lower right')
+plt.show()
+```
+
+Finally, graph the prediction error. It is obvious that, as one would suspect, one-step-ahead prediction is considerably better.
+
+```{python}
+# Prediction error
+
+# Graph
+fig, ax = plt.subplots(figsize=(20,4))
+npre = 4
+ax.set(title='Forecast error', xlabel='Date', ylabel='Forecast - Actual')
+
+# In-sample one-step-ahead predictions and 95% confidence intervals
+predict_error = predict.predicted_mean - endog
+predict_error.loc['1977-10-01':].plot(ax=ax, label='One-step-ahead forecast')
+ci = predict_ci.loc['1977-10-01':].copy()
+ci.iloc[:,0] -= endog.loc['1977-10-01':]
+ci.iloc[:,1] -= endog.loc['1977-10-01':]
+ax.fill_between(ci.index, ci.iloc[:,0], ci.iloc[:,1], alpha=0.1)
+
+# Dynamic predictions and 95% confidence intervals
+predict_dy_error = predict_dy.predicted_mean - endog
+predict_dy_error.loc['1977-10-01':].plot(ax=ax, style='r', label='Dynamic forecast (1978)')
+ci = predict_dy_ci.loc['1977-10-01':].copy()
+ci.iloc[:,0] -= endog.loc['1977-10-01':]
+ci.iloc[:,1] -= endog.loc['1977-10-01':]
+ax.fill_between(ci.index, ci.iloc[:,0], ci.iloc[:,1], color='r', alpha=0.1)
+
+legend = ax.legend(loc='lower left');
+legend.get_frame().set_facecolor('w')
+plt.show()
+```
+
+**Reference**
+
+1. [ACF] https://medium.com/@krzysztofdrelczuk/acf-autocorrelation-function-simple-explanation-with-python-example-492484c32711
+2. [PACF] https://en.wikipedia.org/wiki/Partial_autocorrelation_function
+3. [AR] https://pythondata.com/forecasting-time-series-autoregression/
+4. [MA] https://en.wikipedia.org/wiki/Moving_average
+5. [MA] https://github.com/ritvikmath/Time-Series-Analysis/blob/master/MA%20Model.ipynb
+6. [ARMA] https://github.com/ritvikmath/Time-Series-Analysis/blob/master/ARMA%20Model.ipynb
+7. [ARMA] https://en.wikipedia.org/wiki/Autoregressive%E2%80%93moving-average_model
+8. [ARIMA] https://en.wikipedia.org/wiki/Autoregressive_integrated_moving_average 
+9. [ARIMAX] https://www.statsmodels.org/dev/examples/notebooks/generated/statespace_sarimax_stata.html#
+
+

From fc26f37b1cf177ff9e6b11559d5c90899faa9072 Mon Sep 17 00:00:00 2001
From: lubin-liu <69323872+lubin-liu@users.noreply.github.com>
Date: Thu, 17 Dec 2020 16:28:41 -0500
Subject: [PATCH 2/2] Update _bookdown.yml

---
 _bookdown.yml | 86 +--------------------------------------------------
 1 file changed, 1 insertion(+), 85 deletions(-)

diff --git a/_bookdown.yml b/_bookdown.yml
index a87ec31..a361ddf 100644
--- a/_bookdown.yml
+++ b/_bookdown.yml
@@ -9,90 +9,6 @@ language:
 
 
 # Chapter ordering
-rmd_files: [
-  # Part: Introduction
-  'index.Rmd', # must be first chapter
-  'sample_project.Rmd',
-
-  # Part 2: Data Processing and Wrangling
-  'r_data_transformation.Rmd', # contains PART header
-  'data_preprocessing_tutorial.Rmd',
-  'apply_family.Rmd',
-  'using_sql_in_r.Rmd',
-
-  # Part 3: Data Visualization
-  'pairs_and_ggpairs.Rmd', # contains PART header
-  'a_brief_introduction_to_seaborn.Rmd',
-  'geommlbstadiums.Rmd',
-  'radar_chart.Rmd',
-  'likert_scale.Rmd',
-  'a_simple_way_to_visualize_geographic_data.Rmd',
-  'interactive_choropleth_map.Rmd',
-  'introduction_to_violin_plot.Rmd', # missing data file
-  'base_r_ggplot_graph.Rmd',
-
-  # Part: Results Reporting
-  'visualization_tools_in_real_work.Rmd',  # contains PART header
-  'color_in_r.Rmd',
-  #'visualization_nyc_covid19_rshiny.Rmd', #shiny doesn't work
-  'illustrate_graphs_in_r.Rmd',
-  'shiny_tutorial.Rmd',
-
-  # Part: Complete Analyses
-  'china_choropleth_map.Rmd', # contains PART header
-  'benfords_law.Rmd',
-  'visualizing_electoral_margins.Rmd',
-  'survial_analysis.Rmd',
-
-  # Part: Cheatsheets and Video Tutorials
-  'mosaic_plot_cheatsheet.Rmd',  # contains PART header
-  'add_ts_cheatsheet.Rmd',
-  'python_vs_r_cheatsheet.Rmd',
-  'likert_data_cheatsheet.Rmd',
-  'categorical_cheat_sheet.Rmd',
-  'replication_crisis_psychology.Rmd',
-  'tableau_data_story_tutorial.Rmd',
-
-  # Part: Live Tutorials
-  'tableau_intro_tutorial_demo_with_pset.Rmd', # contains PART header
-  'datapipeline_design.Rmd',
-  'project_pichon.Rmd',
-  'american_history_trivia_night.Rmd',
-  'consuming_terrorism_statistics.Rmd',
-  'introduction_to_network_analysis.Rmd',
-
-  # Part: Data Visualization
-  'label_setting_techniques.Rmd',
-
-  # Part: Translations
-  'chinese_translation_of_candela_package.Rmd', # contains PART header
-  'greek_edav_histogram.Rmd',
-  'korean_translation_of_scatterplot.Rmd',
-  'rmarkdown_tutorial_chinese_translation.Rmd',
-  'ggplot2_tutorial_translated.Rmd',
-  'chinese_tutorial_rvest_httr.Rmd',
-  'website_sharing_whole_class_mosaic_plot_translation.Rmd',
-  'multivariate_grouped_data_plot.Rmd',
-
-  # Part: Other Topics
-  'rselenium_tutorial.Rmd',  # contains PART header
-  'ggpubr_package.Rmd',
-  'geographical_map_packages.Rmd',
-  'health_datasets_for_the_final_project.Rmd',
-  'layout_for_baseplot_and_ggpplot.Rmd',
-  'a_basic_introduction_to_mcmc_in_r.Rmd',
-  'automate_eda_with_dataexplorer.Rmd',
-  'speed_up_in_r_programming.Rmd',
-  'data_visualization_in_python.Rmd',
-  'final_project_teammate_finder.Rmd',
-  'among_us_stats.Rmd',
-  'ipywidgets_tutorial.Rmd',
-  'linkedin_professional_development_session.Rmd',
-
-  # Part: Appendices
-  'appendix_initial_setup.Rmd', # contains PART header
-  'appendix_pull_request_tutorial.Rmd',
-  'appendix_chapter_organization.Rmd'
-]
+'sarimax_explanation.Rmd',
 
 before_chapter_script: "_common.R"