-
Notifications
You must be signed in to change notification settings - Fork 0
K Means Clustering Example Script
kirkvanacore edited this page May 24, 2021
·
1 revision
#### K- Means Clustering #####
### Isolate variables ###
clusterdata_wID <- school[, c("school_id.x",
"sch_avg_rsp",
"sch_avg_weeks_of_use",
"sch_sdev_weeks_of_use",
"pup_users_saturation"
)]
clusterdata_wID <- clusterdata_wID[ which(complete.cases(clusterdata_wID) == T), ] # remove cases with missing data
clusterdata <- school[, c(
"sch_avg_rsp",
"sch_avg_weeks_of_use",
"sch_sdev_weeks_of_use",
"pup_users_saturation"
)]
clusterdata <- clusterdata[ which(complete.cases(clusterdata) == T), ] # remove cases with missing data
# convert everything to numric
clusterdata <- data.frame(lapply(clusterdata, function(x) as.numeric(as.character(x))))
#standardize Scores
clusterdata <- psycho::standardize(clusterdata)
### clustering ###
#estsimate optimal number of clusters -> 3 or 4
fviz_nbclust(clusterdata, kmeans, method = "silhouette")+
geom_vline(xintercept = 3, linetype = 2)
clusters <- kmeans(clusterdata, 3)
clusterdata$cluster3 <- clusters$cluster
clusters <- kmeans(clusterdata, 4)
clusterdata$cluster4 <- clusters$cluster
clusterdata_wID$cluster4 <- clusters$cluster
### Name Clusters ####
clusterdata_wID$cluster_names <- ifelse(clusterdata_wID$cluster4 == 1, "Low Perfoming/Low Usage",
ifelse(clusterdata_wID$cluster4 == 2, "High Perfoming/Low Usage",
ifelse(clusterdata_wID$cluster4 == 3, "School Wide",
"Intervention")))
clusterdata_wID$cluster_names <- factor(clusterdata_wID$cluster_names, levels = c("Low Perfoming/Low Usage",
"High Perfoming/Low Usage",
"Intervention",
"School Wide"))
table(clusterdata_wID$cluster_names, clusterdata_wID$cluster4)
colnames(clusterdata_wID)
###### Create boxplots for Clusters ####
p1<-ggplot(clusterdata_wID, aes(y=sch_avg_rsp, x=as.factor(cluster_names), color =cluster_names )) +
() +
ggtitle("Average Reading Success Probability")+
xlab("Implementation ") +
ylab("School Avg RSPs") +
theme_minimal() +
scale_color_discrete(name="Implementation Type")
get_legend<-function(myggplot){
tmp <- ggplot_gtable(ggplot_build(myggplot))
leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box")
legend <- tmp$grobs[[leg]]
return(legend)
}
leg <- get_legend(p1)
p1<-p1+ theme(legend.position="none",
axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank())
p2<-ggplot(clusterdata_wID, aes(y=sch_avg_weeks_of_use, x=as.factor(cluster_names), color =cluster_names )) +
geom_boxplot() +
ggtitle("Average Weeks of Use")+
xlab("Implementation") +
ylab("Avg Weeks of Us") +
theme_minimal()+
theme(legend.position="none",
axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank())
p3<-ggplot(clusterdata_wID, aes(y=sch_sdev_weeks_of_use, x=as.factor(cluster_names), color =cluster_names )) +
geom_boxplot()+
ggtitle("Varaince Weeks of Use")+
xlab("Implementation") +
ylab("Stand Dev of Weeks of Us") +
theme_minimal()+
theme(legend.position="none",
axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank())
p4<-ggplot(clusterdata_wID, aes(y=pup_users_saturation, x=as.factor(cluster_names), color =cluster_names )) +
geom_boxplot() +
ggtitle("PowerUp Saturation")+
xlab("Implementation") +
ylab("% School Using Powerup") +
theme_minimal() +
theme(legend.position="none",
axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank())
grid.arrange(p1,p2,p4,leg,
top =textGrob("PowerUp Implementation Types",gp=gpar(fontsize=20,font=1)) ,
nrow=2)
# add culsters back to school Data frame
clusterdata_wID <- clusterdata_wID[,c("")]