-
Notifications
You must be signed in to change notification settings - Fork 0
/
time_analysis.py
265 lines (215 loc) · 9.43 KB
/
time_analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
#IMPORTING PACKAGE
import pandas as pd
import numpy as np
import warnings #to ignore the warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
from datetime import datetime#to access datetime
from pandas import Series#to work on series
from sklearn.metrics import mean_squared_error
from math import sqrt
from statsmodels.tsa.api import ExponentialSmoothing, SimpleExpSmoothing, Holt
#READING TRAIN AND TEST DATA..
train = pd.read_csv("Train_SU63ISt.csv")
test = pd.read_csv("Test_0qrQsBZ.csv")
#MAKKING COPY SO THAAT WE CANNOT LOSSE THE DATASET IF WE MAKE CHANGES TO THESE DATASET..
train_original = train.copy()
test_original = test.copy()
train['Datetime'] = pd.to_datetime(train.Datetime,format='%d-%m-%Y %H:%M')
test['Datetime'] = pd.to_datetime(test.Datetime,format='%d-%m-%Y %H:%M')
test_original['Datetime'] = pd.to_datetime(test_original.Datetime,format='%d-%m-%Y %H:%M')
train_original['Datetime'] = pd.to_datetime(train_original.Datetime,format='%d-%m-%Y %H:%M')
#extracting year,month,day and hour from datetime to validate our hypothesis.
for i in (train, test, test_original, train_original):
i['year']=i.Datetime.dt.year
i['month']=i.Datetime.dt.month
i['day']=i.Datetime.dt.day
i['Hour']=i.Datetime.dt.hour
train['day of week'] = train['Datetime'].dt.dayofweek
temp = train['Datetime']
#assign 1 if day of week is weekend and 0 if week is not the weekend.
def applyer(row):
if row.dayofweek == 5 or row.dayofweek == 6:
return 1
else:
return 0
#this is the data we have collected from apply function
temp2 = train['Datetime'].apply(applyer)
train['weekend'] = temp2
#let's look at the time series.
train.index = train['Datetime']#indexing the time to get the time period on the x-axis
df = train.drop('ID',1)#drop ID variable to get only the Datetime on the x-axis.
ts = df['Count']#for plotting on y_axis
plt.figure(figsize=(16,8))
plt.plot(ts, label='Passenger Count')
plt.title('Time Series')
plt.xlabel("Time(year-month)")
plt.ylabel("Passenger Count")
plt.legend(loc="best")
plt.show()
train = train.drop('ID',1)
test.Timestamp = pd.to_datetime(test.Datetime,format='%d-%m-%Y %H:%M')
test.index = test.Timestamp
# Converting to daily mean
test = test.resample('D').mean()
train.Timestamp = pd.to_datetime(train.Datetime,format='%d-%m-%Y %H:%M')
train.index = train.Timestamp
# Converting to daily mean
train = train.resample('D').mean()
#SPLITTING OF DATA
Train = train.ix['2012-08-25':'2014-06-24']
#last 3 month as validation set.
valid = train.ix['2014-06-25':'2014-09-25']
from statsmodels.tsa.stattools import adfuller
def test_stationarity(timeseries):
#Determing rolling statistics
rm = pd.Series(timeseries)
rolmean = rm.rolling(window=24).mean() # 24 hours on each day
rstd = pd.Series(timeseries)
rolstd = rstd.rolling(window=24).std()
#Plot rolling statistics:
orig = plt.plot(timeseries, color='blue',label='Original')
mean = plt.plot(rolmean, color='red', label='Rolling Mean')
std = plt.plot(rolstd, color='black', label = 'Rolling Std')
plt.legend(loc='best')
plt.title('Rolling Mean & Standard Deviation')
plt.show(block=True)
plt.show()
#Perform Dickey-Fuller test:
print ('Results of Dickey-Fuller Test:')
dftest = adfuller(timeseries, autolag='AIC')
dfoutput = pd.Series(dftest[0:4], index=['Test Statistic','p-value','#Lags Used','Number of Observations Used'])
for key,value in dftest[4].items():
dfoutput['Critical Value (%s)'%key] = value
print (dfoutput)
from matplotlib.pylab import rcParams#this is use for handlig default values
rcParams['figure.figsize'] = 20,10
#till here we observe that there is some sort of trend
test_stationarity(train_original['Count'])
#now removing the trend in the time series graph by using log transformation
Train_log = np.log(Train['Count'])
valid_log = np.log(valid['Count'])
mavg = pd.Series(Train_log)
moving_avg = mavg.rolling(24).mean()
plt.plot(Train_log)
plt.plot(moving_avg, color='black')
plt.show()
#we will observe that there is an increasing trend in the graph
#so we will remove this increaing trend to make our time series more stationry
train_log_moving_avg_diff = Train_log - moving_avg
train_log_moving_avg_diff.dropna(inplace=True)
test_stationarity(train_log_moving_avg_diff)
train_log_diff = Train_log - Train_log.shift(1)
test_stationarity(train_log_diff.dropna())
#removing seasonality
from statsmodels.tsa.seasonal import seasonal_decompose
decomposition = seasonal_decompose(pd.DataFrame(Train_log).Count.values, freq = 24)
#through decomposition technique we for m a residual part which we will use ot further.
trend = decomposition.trend
seasonal = decomposition.seasonal
residual = decomposition.resid
plt.subplot(411)
plt.plot(Train_log, label='Original')
plt.legend(loc='best')
plt.subplot(412)
plt.plot(trend, label='Trend')
plt.legend(loc='best')
plt.subplot(413)
plt.plot(seasonal,label='Seasonality')
plt.legend(loc='best')
plt.subplot(414)
plt.plot(residual, label='Residuals')
plt.legend(loc='best')
plt.tight_layout()
plt.show()
#checking stationarity of residuals
train_log_decompose = pd.DataFrame(residual)
train_log_decompose['date'] = Train_log.index
train_log_decompose.set_index('date', inplace = True)
train_log_decompose.dropna(inplace=True)
test_stationarity(train_log_decompose[0])
from statsmodels.tsa.stattools import acf, pacf
lag_acf = acf(train_log_diff.dropna(), nlags=25)
lag_pacf = pacf(train_log_diff.dropna(), nlags=25, method='ols')
#ACF and PACF plot
plt.plot(lag_acf)
plt.axhline(y=0,linestyle='--',color='gray')
plt.axhline(y=-1.96/np.sqrt(len(train_log_diff.dropna())),linestyle='--',color='gray')
plt.axhline(y=1.96/np.sqrt(len(train_log_diff.dropna())),linestyle='--',color='gray')
plt.title('Autocorrelation Function')
plt.show()
plt.plot(lag_pacf)
plt.axhline(y=0,linestyle='--',color='gray')
plt.axhline(y=-1.96/np.sqrt(len(train_log_diff.dropna())),linestyle='--',color='gray')
plt.axhline(y=1.96/np.sqrt(len(train_log_diff.dropna())),linestyle='--',color='gray')
plt.title('Partial Autocorrelation Function')
plt.show()
#AR models
from statsmodels.tsa.arima_model import ARIMA
model = ARIMA(Train_log, order=(2, 1, 0)) # here the q value is zero since it is just the AR model
results_AR = model.fit(disp=-1)
plt.plot(train_log_diff.dropna(), label='original')
plt.plot(results_AR.fittedvalues, color='red', label='predictions')
plt.legend(loc='best')
plt.show()
#plotting validation curve for AR model
#for that we have to change the scale of the model to original scale
#first step to store the predicted result as a separate series and observe it.
AR_predict=results_AR.predict(start="2014-06-25", end="2014-09-25")
AR_predict=AR_predict.cumsum().shift().fillna(0)
AR_predict1=pd.Series(np.ones(valid.shape[0]) * np.log(valid['Count'])[0], index = valid.index)
AR_predict1=AR_predict1.add(AR_predict,fill_value=0)
AR_predict = np.exp(AR_predict1)
plt.plot(valid['Count'], label = "Valid")
plt.plot(AR_predict, color = 'red', label = "Predict")
plt.legend(loc= 'best')
plt.title('RMSE: %.4f'% (np.sqrt(np.dot(AR_predict, valid['Count']))/valid.shape[0]))
plt.show()
#the red line will show the prediction for the vqqalidation set...
#MA model moving-average model specifies that the output variable depends linearly on the current and various past values of a stochastic term
#stochastic means imperfectly predictable terms
model = ARIMA(Train_log, order=(0, 1, 2)) # here the p value is zero since it is just the MA model
results_MA = model.fit(disp=-1)
plt.plot(train_log_diff.dropna(), label='original')
plt.plot(results_MA.fittedvalues, color='red', label='prediction')
plt.legend(loc='best')
plt.show()
MA_predict=results_MA.predict(start="2014-06-25", end="2014-09-25")
MA_predict=MA_predict.cumsum().shift().fillna(0)
MA_predict1=pd.Series(np.ones(valid.shape[0]) * np.log(valid['Count'])[0], index = valid.index)
MA_predict1=MA_predict1.add(MA_predict,fill_value=0)
MA_predict = np.exp(MA_predict1)
plt.plot(valid['Count'], label = "Valid")
plt.plot(MA_predict, color = 'red', label = "Predict")
plt.legend(loc= 'best')
plt.title('RMSE: %.4f'% (np.sqrt(np.dot(MA_predict, valid['Count']))/valid.shape[0]))
plt.show()
#now let's combine these two models...
#COMBINED MODLES
model = ARIMA(Train_log, order=(2, 1, 2))
results_ARIMA = model.fit(disp=-1)
plt.plot(train_log_diff.dropna(), label='original')
plt.plot(results_ARIMA.fittedvalues, color='red', label='predicted')
plt.legend(loc='best')
plt.show()
#defining the model that change the scale of the model to original scale
def check_prediction_diff(predict_diff, given_set):
predict_diff= predict_diff.cumsum().shift().fillna(0)
predict_base = pd.Series(np.ones(given_set.shape[0]) * np.log(given_set['Count'])[0], index = given_set.index)
predict_log = predict_base.add(predict_diff,fill_value=0)
predict = np.exp(predict_log)
plt.plot(given_set['Count'], label = "Given set")
plt.plot(predict, color = 'red', label = "Predict")
plt.legend(loc= 'best')
plt.title('RMSE: %.4f'% (np.sqrt(np.dot(predict, given_set['Count']))/given_set.shape[0]))
plt.show()
def check_prediction_log(predict_log, given_set):
predict = np.exp(predict_log)
plt.plot(given_set['Count'], label = "Given set")
plt.plot(predict, color = 'red', label = "Predict")
plt.legend(loc= 'best')
plt.title('RMSE: %.4f'% (np.sqrt(np.dot(predict, given_set['Count']))/given_set.shape[0]))
plt.show()
#let's predict the vaue for the validation set
ARIMA_predict_diff=results_ARIMA.predict(start="2014-06-25", end="2014-09-25")
check_prediction_diff(ARIMA_predict_diff, valid)