#########################################################################################
# ACKNOWLEDGEMENTS
# this example is based on the following articles and code samples:
# http://rstudio-pubs-static.s3.amazonaws.com/5983_af66eca6775f4528a72b8e243a6ecf2d.html
# http://thinktostart.com/cluster-twitter-data-with-r-and-k-means/
# http://www.r-statistics.com/2013/08/k-means-clustering-from-r-in-action/
#########################################################################################

# Load the per-user Twitter data; text columns are kept as character vectors
# instead of being converted to factors (FALSE is spelled out because the
# short alias F is an ordinary variable that can be reassigned)
tw.data <- read.csv(file = "data/twitter/chi2015_userdata.csv", stringsAsFactors = FALSE)
# inspect the structure (columns and their types) of the loaded data frame
str(tw.data)

##################################
# EXAMINING AND PREPARING THE DATA
##################################

library(ggplot2)
## Get an initial insight into the data:
## simple scatter plot of users' data, by the number of followers and those they follow.
## geom_point() is used rather than layer(geom = "point"): layer() expects the
## stat and position to be supplied as well and fails when they are missing,
## whereas geom_point() fills in the defaults for a scatter plot.
gg1 <- ggplot(data = tw.data, aes(x = Following, y = Followers)) +
  geom_point() +
  xlab("Following Count") +
  ylab("Followers Count") +
  ggtitle("Twitter users tweeting about chi2015")
gg1

## we can observe a couple of outliers here (= Twitter users with very high
## number of Followers and/or very high number of Following); we might want to
## take a closer look:
summary(tw.data)

## in situations like this - when outliers are present - we might want to remove them
## before proceeding with the clustering; k-means tends to perform poorly when outliers
## are present; so, it would be good to remove them
## let's first detect outliers: the users holding the maximum Followers count and
## the maximum Following count; na.rm = TRUE keeps a missing count from turning
## the maximum into NA (in which case which() would not match any row)
with.max.followers <- which(tw.data$Followers == max(tw.data$Followers, na.rm = TRUE))
tw.data[with.max.followers, ]
with.max.following <- which(tw.data$Following == max(tw.data$Following, na.rm = TRUE))
tw.data[with.max.following, ]
## now, remove them; the removal is skipped when nothing was detected, because
## indexing with an empty negative vector (tw.data[-integer(0), ]) would drop
## every row instead of none
outlier.rows <- unique(c(with.max.following, with.max.followers))
if (length(outlier.rows) > 0) {
  tw.data <- tw.data[-outlier.rows, ]
}
## check the data again
summary(tw.data)

## The initial graph was not particularly useful...
## When a few instances have very large attribute values while most instances
## have small ones (power law distribution), the attribute is typically
## log transformed, as that offers a better insight into the data.
## Two new attributes are added to tw.data: the log transformed Following and
## Followers counts; 1 is added first so that a count of 0 does not produce
## an undefined log(0) value
tw.data[["FollowingLog"]] <- log(1 + tw.data[["Following"]])
tw.data[["FollowersLog"]] <- log(1 + tw.data[["Followers"]])
str(tw.data)
summary(tw.data[["FollowingLog"]])
summary(tw.data[["FollowersLog"]])

# Let's plot the two new (log transformed) variables.
# geom_point() is used rather than layer(geom = "point"): layer() expects the
# stat and position to be supplied as well and fails when they are missing,
# whereas geom_point() fills in the defaults for a scatter plot.
gg1.log <- ggplot(data = tw.data, aes(x = FollowingLog, y = FollowersLog)) +
  geom_point() +
  xlab("Log Following Count") +
  ylab("Log Followers Count") +
  ggtitle("Twitter users tweeting about chi2015 (Log Scale)")
gg1.log

# No obvious pattern in the data, but let's run k-means and see if some clusters will emerge

############################
# CLUSTERING WITH 2 FEATURES
############################

# create a subset of the original data containing the vectors to be used in the K-Means
# (the two log transformed attributes: FollowersLog and FollowingLog)
cl.data <- tw.data[, c("FollowersLog", "FollowingLog")]
str(cl.data)

# fix the seed of the random number generator so that the runs below are reproducible
set.seed(123)
# run the K Means algorithm, specifying, for example, 4 centers
tw.4k <- kmeans(x = cl.data, centers=4, iter.max=20, nstart=1000)
## <! explain the meaning of iter.max (a 'safety net' for slow-to-converge cases)
## and nstart (multiple instantiations, i.e., random starts) !>

# let's inspect the results
tw.4k

## <! explain clustering evaluation metrics:
# - within_SS - within cluster sum of squares, i.e., sum of squared differences between individual
#   data points in a cluster and the cluster center; it is computed for each cluster
# - total_SS: the sum of squared differences of each data point to the global sample mean
# - between_SS: the sum of squared differences of each cluster center to the global sample mean
#   (when computing this value, the squared difference of each cluster center to the global 
#   sample mean is multiplied by the number of data points in that cluster)
# - between_SS / total_SS: this ratio indicates how 'well' the sample splits into clusters;
#   the higher the ratio, the better the clustering
## !>

# store each observation's cluster assignment next to its feature values
cl.data[["cluster"]] <- tw.4k$cluster
head(cl.data)

# colour the points by the cluster they were assigned to,
# using a gradient that runs from blue to red
gg1.log <- gg1.log +
  aes(color = tw.4k$cluster) +
  scale_colour_gradient(low = 'blue', high = 'red')
gg1.log
# draw the cluster centers on top of the points (black, size 5, shape 17)
gg1.log + geom_point(data = as.data.frame(tw.4k$centers),
                     colour = "black", size = 5, shape = 17)


# Let's try having 5 centers and see what happens
# (note that this run uses iter.max=10 and nstart=100, i.e., lower values
# than the ones used for the 4-center solution)
cl.data <- cl.data[, c(1,2)]  # remove the column with clusters
tw.5k <- kmeans(x = cl.data, centers=5, iter.max=10, nstart=100)
tw.5k

# store the new cluster assignments and re-colour the existing plot by them
cl.data$cluster <- tw.5k$cluster
gg1.log <- gg1.log + aes(color=cl.data$cluster)
# (optional alternative colour gradient, currently disabled)
#gg1.log <- gg1.log + scale_colour_gradient(low='yellow', high='brown')
gg1.log

# the plots allow us to visually inspect the clusters, but it is better to have a more
# systematic approach to judging the quality of clustering and selecting the best value for K

#################################
# SELECTING THE BEST VALUE FOR K
#################################

# SELECTING K USING THE ELBOW METHOD
# instead of guessing the correct value for K, we can take a more 
# systematic approach to choosing the 'right' K; it is called the Elbow method, and
# is based on the sum of squared differences between data points and cluster centres,
# i.e., the sum of within_SS for all the clusters (tot.withinss) 

# let's compute the metric required for the Elbow method (tot.withinss);
# along the way, we'll also compute the other metric: ratio of between_SS and total_SS
# keep only the two feature columns (i.e., remove the column with clusters),
# so that k-means is run on the features alone
cl.data <- cl.data[, c(1,2)]

# run kmeans for all K values in the range 2:10; each run contributes one row
# holding K, the total within-cluster sum of squares and the
# between_SS / total_SS ratio. The rows are created as properly named one-row
# data frames and bound together once, instead of growing a data frame with
# rbind() on every iteration.
eval.metrics.2var <- do.call(rbind, lapply(2:10, function(k) {
  km.res <- kmeans(x = cl.data, centers = k, iter.max = 20, nstart = 1000)
  data.frame(cluster = k,
             tot.within.ss = km.res$tot.withinss,
             ratio = km.res$betweenss / km.res$totss)
}))
eval.metrics.2var

# draw the Elbow plot: total within-cluster sum of squares for each value of K
# library() is used instead of require(): require() merely returns FALSE when
# the package is missing, which postpones the failure to the first ggplot call
library(ggplot2)
# NOTE(review): the theme requests the "Garamond" font family; presumably the
# font has to be available to the graphics device in use - confirm
ggplot(data = eval.metrics.2var, aes(x = cluster, y = tot.within.ss, group = 1)) +
  theme_bw(base_family = "Garamond") +
  geom_line(colour = "darkgreen") +
  theme(text = element_text(size = 20)) +
  ggtitle("Reduction in error for different values of K\n") +
  xlab("\nClusters") +
  ylab("Total Within Cluster Sum of Squares\n") +
  scale_x_continuous(breaks = seq(from = 0, to = 10, by = 1))
## it seems that K=3 or K=4 would be the best options for the number of clusters

## if it is not fully clear from the plot where we have significant decrease in the 
## tot.within.ss value, we can compute the difference between each two subsequent values
## NOTE: compute.difference() is defined in the UTILITY FUNCTIONS section at the end
## of this file, so that section has to be run before the lines below
eval.metrics.2var$tot.within.ss.delta <- compute.difference(eval.metrics.2var$tot.within.ss)
eval.metrics.2var$ratio.delta <- compute.difference(eval.metrics.2var$ratio)
# print the metrics together with the newly added delta columns
eval.metrics.2var

## USE R's SUPPORT FOR SELECTING THE BEST VALUE FOR K
## there is also an R function that checks numerous criteria (up to 30) to
## determine the best number of clusters
# install.packages("NbClust")
# require(NbClust)
# nc <- NbClust(data = cl.data, min.nc = 2, max.nc = 6,  method = "kmeans")
# ## number of criteria that 'voted' for certain number of clusters
# table(nc$Best.n[1,])

## let's examine the solution with K=3
tw.3k <- kmeans(x = cl.data, centers = 3, iter.max = 20, nstart = 1000)
## first, plot it: the points are re-coloured by their cluster, using a
## yellow-to-brown gradient whose legend is titled 'Clusters' and has one
## break per cluster
gg1.log <- gg1.log +
  aes(color = tw.3k$cluster) +
  scale_colour_gradient(low = 'yellow', high = 'brown',
                        name = 'Clusters',
                        breaks = unique(tw.3k$cluster))
gg1.log
# draw the cluster centers on top of the points (black, size 5, shape 17)
gg1.log + geom_point(data = as.data.frame(tw.3k$centers),
                     colour = "black", size = 5, shape = 17)


## next, we examine clusters closer by looking into the cluster centers (mean) 
## and standard deviation from the centers 
## NOTE: summary.stats() is defined in the UTILITY FUNCTIONS section at the end
## of this file, so that section has to be run before the call below
sum.stats <- summary.stats(feature.set = cl.data, 
                           clusters = tw.3k$cluster, cl.num = 3)
# this is for generating nice table layout
# install.packages("knitr")
library(knitr)
# print the summary statistics as a table in the "rst" format
kable(x = sum.stats, format = "rst")


###############################
# CLUSTERING WITH MORE FEATURES
###############################

# we often do clustering on more than two variables;
# let's add additional variables we have about the users
str(tw.data)
# NOTE(review): columns 4 and 5 are presumably Tweets and Favorites - confirm against the CSV
summary(tw.data[, c(4, 5)])
# log transform the two additional attributes (1 is added to avoid log(0))
tw.data$TweetsLog <- log(tw.data$Tweets + 1)
tw.data$FavoritesLog <- log(tw.data$Favorites + 1)

# create a subset of data to be used for clustering; the four log transformed
# attributes are selected by name, so that the selection does not depend on
# the number or the order of the columns in the input file
cl.data <- tw.data[, c("FollowingLog", "FollowersLog", "TweetsLog", "FavoritesLog")]
str(cl.data)
summary(cl.data)
# since all 4 attributes have roughly equal value ranges we do not have to scale them
# if that was not the case, scaling would be required

# instead of guessing K, we'll right away use the Elbow method to find the optimal value for K
# run kmeans for all K values in the range 2:10; each run contributes one row
# holding K, the total within-cluster sum of squares and the
# between_SS / total_SS ratio. The rows are created as properly named one-row
# data frames and bound together once, instead of growing a data frame with
# rbind() on every iteration.
eval.metrics.4var <- do.call(rbind, lapply(2:10, function(k) {
  km.res <- kmeans(x = cl.data, centers = k, iter.max = 15, nstart = 1000)
  data.frame(cluster = k,
             tot.within.ss = km.res$tot.withinss,
             ratio = km.res$betweenss / km.res$totss)
}))
eval.metrics.4var

# Elbow plot for the models built on the current feature set:
# total within-cluster sum of squares for each value of K
ggplot(data = eval.metrics.4var,
       mapping = aes(x = cluster, y = tot.within.ss, group = 1)) +
  geom_line(colour = "darkgreen") +
  scale_x_continuous(breaks = seq(from = 0, to = 10, by = 1)) +
  labs(title = "Reduction in error for different values of K\n",
       x = "\nClusters",
       y = "Total Within Cluster Sum of Squares\n") +
  theme_bw(base_family = "Garamond") +
  theme(text = element_text(size = 20))

## this time it seems that the solution with either 3 or 6 clusters would be the best
## we'll examine both solutions

# ## let's check with the NbClust f.
# nc <- NbClust(data = cl.data, min.nc = 2, max.nc = 6, method = "kmeans")
# ## number of criteria that 'voted' for certain number of clusters
# table(nc$Best.n[1,])

## first, the clustering for k=3
## NOTE: summary.stats() is defined in the UTILITY FUNCTIONS section at the end
## of this file, so that section has to be run before the calls below
tw.3k.4var <- kmeans(x=cl.data, centers=3, iter.max=15, nstart = 1000)
sum.stats <- summary.stats(feature.set = cl.data, 
                           clusters = tw.3k.4var$cluster, cl.num = 3)
# print the per-cluster summary statistics as a table in the "rst" format
kable(x = sum.stats, format = "rst")

## now, the one for k=6
tw.6k.4var <- kmeans(x=cl.data, centers=6, iter.max=15, nstart = 1000)
sum.stats <- summary.stats(feature.set = cl.data, 
                           clusters = tw.6k.4var$cluster, cl.num = 6)
kable(x = sum.stats, format = "rst")

##############################################
# COMPARISON OF DIFFERENT CLUSTERING SOLUTIONS
##############################################

## now we have a couple of possible clustering solutions, but the question is which
## is the best one: the clustering model built with 2 or 4 variables, 3 or 6 clusters?
## let's compare the clustering results produced by these different models
## to that end we'll use the cluster.stats() f. (from fpc package) - it allows for
## comparing two or more clustering solutions, regardless of the clustering method
## applied to obtain those clusters
## the package is installed only if it is missing, so that re-running the script
## does not download and reinstall it every time
if (!requireNamespace("fpc", quietly = TRUE)) {
  install.packages("fpc")
}
library(fpc)

## let's define the criteria to be used for comparison
comparison.criteria <- c("max.diameter", "min.separation", "average.between",
                         "average.within", "within.cluster.ss")
## distances between the instances of cl.data; all the solutions are evaluated
## against this same distance object
d <- dist(x = cl.data)
## compute the statistics for each solution and keep only the rows that
## correspond to the chosen criteria (one column per clustering solution)
comparison <- sapply(list(km.3k.2var = tw.3k$cluster, # model with 2 variables and 3 clusters
                          km.3k.4var = tw.3k.4var$cluster, # model with 4 variables and 3 clusters
                          km.6k.4var = tw.6k.4var$cluster), # model with 4 variables and 6 clusters
                     FUN = function(x)
                       cluster.stats(d, x))[comparison.criteria, ]
kable(x = comparison, format = "rst")
## the metrics indicate that the solution with 4 variables is better than the one with 2 variables
## as for the value of K, that is unclear and should be decided based on the specific application case
## that is, why do we want to cluster Twitter users and whether we prefer a more fine grained grouping
## or a coarse grained solution would be preferable

######################
## UTILITY FUNCTIONS #
######################

## function that computes the absolute difference between each two subsequent values
##
## values  - a numeric vector
## returns - a numeric vector of the same length as 'values': the first element
##           is NA (it has no predecessor), and every other element i holds
##           abs(values[i] - values[i-1]); an empty input gives an empty vector
compute.difference <- function(values) {
  if (length(values) == 0) {
    return(numeric(0))
  }
  # diff() is vectorised and returns an empty vector for a single value, so an
  # input of length one gives just NA (an index loop over 1:(length - 1) would
  # run over c(1, 0) in that case and fail)
  c(NA_real_, abs(diff(values)))
}

## function that provides summary statistics about clusters
##
## feature.set - data frame with the (numeric) features used for clustering
## clusters    - vector with the cluster assignment of each row of feature.set
## cl.num      - number of clusters; optional - when omitted, the number of
##               distinct clusters found in 'clusters' is used
## returns     - a data frame with one row per attribute (cluster id, cluster
##               size, and each feature) and, after the 'attributes' column,
##               one "Mean (SD)" column per cluster; feature cells are
##               formatted as "mean (sd)", rounded to 2 digits
summary.stats <- function(feature.set, clusters, cl.num = NULL) {
  # formats the mean and standard deviation of one feature within one cluster
  describe <- function(x) {
    paste0(round(mean(x, na.rm = TRUE), digits = 2), " (",
           round(sd(x, na.rm = TRUE), digits = 2), ")")
  }
  sum.stats <- aggregate(x = feature.set,
                         by = list(clusters),
                         FUN = describe)
  if (is.null(cl.num)) {
    cl.num <- nrow(sum.stats)
  }
  # drop = FALSE keeps the feature columns as a data frame even when there is
  # only one feature; otherwise that column would lose its name
  sum.stat.df <- data.frame(cluster = sum.stats[, 1],
                            freq = as.vector(table(clusters)),
                            sum.stats[, -1, drop = FALSE])

  # transpose, so that clusters become columns and attributes become rows
  sum.stats.transpose <- as.data.frame(t(as.matrix(sum.stat.df)))
  attr.names <- rownames(sum.stats.transpose)
  sum.stats.transpose <- as.data.frame(cbind(attributes = attr.names,
                                             sum.stats.transpose))
  colnames(sum.stats.transpose) <- c("attributes", rep("Mean (SD)", cl.num))
  rownames(sum.stats.transpose) <- NULL
  sum.stats.transpose
}

