-
Notifications
You must be signed in to change notification settings - Fork 3
/
3A.IBM Watson Marketing AB Test - ANOVA model.R
252 lines (177 loc) · 7.77 KB
/
3A.IBM Watson Marketing AB Test - ANOVA model.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
############# Marketing A/B Test ######
#Dataset from IBM Watson Community
#####load libraries ######
library(multcomp)
library(tidyverse)
library(cowplot)
library(VIM)
###### Data Import & Data Exploration #############
df<-read.csv("https://raw.githubusercontent.com/pthiagu2/DataMining/master/WA_Fn-UseC_-Marketing-Campaign-Eff-UseC_-FastF.csv")
head(df)
#check for missing data using VIM package
aggr(df, prop = F, numbers = T) # no red - no missing values
#summary sales statistics
grouped.df <- df %>%
group_by(Promotion) %>%
summarize(
count = n(),
totalSales = sum(SalesInThousands),
meanSales = mean(SalesInThousands),
sd = sd(SalesInThousands))
#see stats
grouped.df
#We can see that group 3 created the most sales
#followed by groups 1 & 2
#We can also see that there were 172 stores that were in promotion 1 while there
#Were 188 stores in promotion 2. This is technically not balanced, but nearly-balanced.
#As long as we have equal variances in our groups, this shouldn't be a problem.
# Box plots
# ++++++++++++++++++++
# Visualize means of sales
library("ggpubr")
ggboxplot(df, x = "Promotion", y = "SalesInThousands",
color = "Promotion", palette = c("#00AFBB", "#E7B800", "#FC4E07"),
ylab = "Sales", xlab = "Promotion")
#We see that promotion 1 has the most average sales
#followed by promotion 3 just like in our summary statistics table
############Data Visualization #############
#Let's check and explore the stores
viz_1 <- ggplot(df, aes(x=MarketSize))+
geom_bar(stat="count", width=0.7)+
theme_minimal()
#Create a subset of data
#market size by promotion data frame
market_df <- df %>%
group_by(Promotion) %>%
count(MarketSize) %>%
mutate(percent = n/sum(n))
market_df #check results
viz_2 <- ggplot(data = df, aes(x=MarketSize, fill = factor(Promotion))) +
geom_bar(position = "fill") +
theme(
plot.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
plot.title = element_text(hjust = 0.5),
text=element_text(size=14, family="Helvetica")) + labs(x = " ", title = "MarketSize")
viz_3 <- ggplot(data = df, aes(x = factor(Promotion), y = LocationID, fill = Promotion))+geom_boxplot()
viz_4 <- ggplot(data = df, aes(x = factor(Promotion), y = AgeOfStore, fill = Promotion))+geom_boxplot()
viz_5 <- ggplot(data=df, aes(x=factor(AgeOfStore)))+
geom_bar(stat="count", width=0.7, fill="#b2df8a")+
theme_minimal()
#there is a bit of a delay before plots appear
plot_grid(viz_1, viz_2, viz_3, viz_4, viz_5, labels = "AUTO")
######### Data Cleaning ###########
#check the promotion variable
str(df$Promotion) #an integer object, we need to change this
#factor the promotion variable before we model it
df$Promotion <- as.factor(df$Promotion)
#check results
str(df$Promotion)
###### Data Question ###############
#Does store sales differ by promotion?
#Compute the mean sales using aggregate function
aggregate(SalesInThousands ~ Promotion, df, mean)
#promotion 1 has the highest level of sales, but
# is it statistically significant?
######## Significance Testing - ANOVA #############
#promotion 1 has the highest mean of sales, but is it statistically significant?
#We perform a one-way ANOVA
df.anova <- aov(SalesInThousands ~ Promotion, data = df)
summary(df.anova)
#We plot the ANOVA model to visualize confidence
#intervals for mean sales by promotion
# 2. Normality plot - check if distribution is normal
plot(df.anova, 2)
# Check normality assumption by analyzing the model residuals
# Create a QQ plot of residuals
qqplot(residuals(df.anova))
#There is a bit of skew in the right tail
#This has a bit more skew than what we would normally expect,
#but we will address this later on in detail
# Lets plot the residuals in a histogram
hist(residuals(df.anova))
#there is a slight right skew in the distribution
tally(~Promotion, data = df)
#Conclusions and interpretation:
#We see that the sales differs by Promotion,
#and the model is statistically significant
#but we don't know which pair groups are significant
#How can we change this?? We need to perform additional testing
###### Method 1 Multiple comparisons using multcomp package #############
#Use glht() to perform multiple pairwise-comparisons for
# a one-way ANOVA: (with confidence interval and p-values)
summary(glht(df.anova, linfct = mcp(Promotion = "Tukey")))
#group 2 is significant against group 1
#group 3 is significant against group 2
TukeyHSD(aov(df.anova), "Promotion") #does same as glht function but includes the
#confidence intervals
# plot difference in mean levels of promotion
plot(TukeyHSD(df.anova))
#Tukey comparison of means - much better and has confidence intervals
a1 <- aov(formula = df$SalesInThousands ~ df$Promotion)
plot(a1) # plots to check normality
#Post hoc testing
posthoc <- TukeyHSD(x=a1, conf.level = 0.95)
posthoc
#plot means
library(gplots)
plotmeans(SalesInThousands ~ Promotion, data = df,
frame = FALSE, connect = TRUE, mean.labels = TRUE,
digits = 2, col=" dark red",
barwidth=2, pch = " ",
main = "Groups 1 & 2 and 3 & 2 are significant")
###### Method 2: Change ANOVA equation to remove intercept term #############
df.anova2 <- aov(SalesInThousands ~ -1 + Promotion, data = df)
glht(df.anova2)
#With intercept removed, glht gives us the mean value for each segment
# Now we plot difference in mean levels of promotion
plot(glht(df.anova2), xlab="Average sales by promotion (95% CI)")
#The dot shows the mean for each segment and bars reflect the confidence intervals.
#With all 3 plotted with confidence intervals, Promo 2 is significantly worse
#than Promo 1 and 3, but we cannot say that Promo 1 and 3 are significant as their
#confidence intervals overlap.
####### Check ANOVA Assumptions ############
#I'm doing this as the residuals were a bit skewed
#1. Homogeneity of variances
plot(a1) #first anova model
#Looks good
#2. Levene's test for non-normal distribution - we check due to skew in residuals
library(car)
leveneTest(SalesInThousands ~ Promotion, data = df)
#We see that the p-value for Promotion 2 is large and therefore not significant.
#So we are good here. Promotion 2 is still not significant.
#3. Shapiro-Wilk (Has better power than Kolmogorov-Smirnoff (K-S) test)
# Extract the residuals
aov_residuals <- residuals(object = a1)
# Run Shapiro-Wilk test
shapiro.test(x = aov_residuals )
#We reject null hypothesis that residuals are normally distributed
#4. Kruskal-Wallis
#Non-parametric alternative to ANOVA
# It’s recommended when the assumptions of one-way ANOVA test are not
# met. One of those assumptions are that the residuals are normally
# distributed
kruskal.test(SalesInThousands ~ Promotion, data = df)
#The p-value is tiny; therefore, we can reject null hypothesis that there
# are no differences in group means, but we don't know which groups.
#5. We do pairwise comparisons and adjust for multiple groups
pairwise.wilcox.test(df$SalesInThousands, df$Promotion,
p.adjust.method = "bonferroni", paired = FALSE)
#This validates what we have done above with original anova model
#Our conclusions from are original findings are still valid
#most likely due to having a very large sample size to make
#the group comparisons.
######### Final Summary: What should you tell the marketing & sales team? ############
#Let's run again with just promotion 1 & 3 to
#see if we can get a significant result. The test
#should not take as long to run as we only have
#2 groups to compare so we could see significant
#results quite fast.
# Having a proper control group for comparison to be able
# to calculate the impact of the promotions
#It appeared in group 1 there were some stores that were slightly
#younger than those in Group 3
#it may not have made a difference
#but we should try to control for this.