Skip to content

K Means Clustering Example Script

kirkvanacore edited this page May 24, 2021 · 1 revision

#### K- Means Clustering #####


### Isolate variables ###
clusterdata_wID <- school[, c("school_id.x",
                         "sch_avg_rsp",
                         "sch_avg_weeks_of_use",
                         "sch_sdev_weeks_of_use",
                         "pup_users_saturation"
                         )]
clusterdata_wID <- clusterdata_wID[ which(complete.cases(clusterdata_wID) == T), ] # remove cases with missing data

clusterdata <- school[, c(
                          "sch_avg_rsp",
                          "sch_avg_weeks_of_use",
                          "sch_sdev_weeks_of_use",
                          "pup_users_saturation"
)]

clusterdata <- clusterdata[ which(complete.cases(clusterdata) == T), ] # remove cases with missing data

# convert everything to numric
clusterdata <- data.frame(lapply(clusterdata, function(x) as.numeric(as.character(x))))
#standardize Scores
clusterdata <-   psycho::standardize(clusterdata)


### clustering ###
#estsimate optimal number of clusters -> 3 or 4
fviz_nbclust(clusterdata, kmeans, method = "silhouette")+
  geom_vline(xintercept = 3, linetype = 2)

clusters <- kmeans(clusterdata, 3)
clusterdata$cluster3 <- clusters$cluster
clusters <- kmeans(clusterdata, 4)
clusterdata$cluster4 <- clusters$cluster
clusterdata_wID$cluster4 <- clusters$cluster



### Name Clusters ####
clusterdata_wID$cluster_names <- ifelse(clusterdata_wID$cluster4 == 1, "Low Perfoming/Low Usage",
                                    ifelse(clusterdata_wID$cluster4 == 2, "High Perfoming/Low Usage",
                                           ifelse(clusterdata_wID$cluster4 == 3, "School Wide",
                                                  "Intervention")))
clusterdata_wID$cluster_names <- factor(clusterdata_wID$cluster_names, levels = c("Low Perfoming/Low Usage",
                                                                                  "High Perfoming/Low Usage",
                                                                                  "Intervention",
                                                                                  "School Wide"))
table(clusterdata_wID$cluster_names, clusterdata_wID$cluster4)
colnames(clusterdata_wID)

###### Create boxplots for Clusters ####
p1<-ggplot(clusterdata_wID, aes(y=sch_avg_rsp, x=as.factor(cluster_names), color =cluster_names )) +
  () +
  ggtitle("Average Reading Success Probability")+
  xlab("Implementation ") +
  ylab("School Avg RSPs") +   
  theme_minimal() +
  scale_color_discrete(name="Implementation Type")
        get_legend<-function(myggplot){
          tmp <- ggplot_gtable(ggplot_build(myggplot))
          leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box")
          legend <- tmp$grobs[[leg]]
          return(legend)
        }

leg <- get_legend(p1)
 p1<-p1+ theme(legend.position="none",
               axis.title.x=element_blank(),
               axis.text.x=element_blank(),
               axis.ticks.x=element_blank())
p2<-ggplot(clusterdata_wID, aes(y=sch_avg_weeks_of_use, x=as.factor(cluster_names), color =cluster_names )) +
  geom_boxplot() +
  ggtitle("Average Weeks of Use")+
  xlab("Implementation") +
  ylab("Avg Weeks of Us") +   
  theme_minimal()+
  theme(legend.position="none",
        axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank())
p3<-ggplot(clusterdata_wID, aes(y=sch_sdev_weeks_of_use, x=as.factor(cluster_names), color =cluster_names )) +
  geom_boxplot()+
  ggtitle("Varaince Weeks of Use")+
  xlab("Implementation") +
  ylab("Stand Dev of Weeks of Us") +   
  theme_minimal()+ 
  theme(legend.position="none",
        axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank())

p4<-ggplot(clusterdata_wID, aes(y=pup_users_saturation, x=as.factor(cluster_names), color =cluster_names )) +
  geom_boxplot() +
  ggtitle("PowerUp Saturation")+
  xlab("Implementation") +
  ylab("% School Using Powerup") +
  theme_minimal() +
  theme(legend.position="none",
        axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank())



grid.arrange(p1,p2,p4,leg,
             top =textGrob("PowerUp Implementation Types",gp=gpar(fontsize=20,font=1)) ,
             nrow=2)
# add culsters back to school Data frame
clusterdata_wID <- clusterdata_wID[,c("")]

Clone this wiki locally