###################################################################
# ACKNOWLEDGEMENTS
# this example is based on Chapter 10 of the book "R and Data Mining"
# http://www.rdatamining.com/books/rdm 
#
# the dataset (a set of tweets by @RDataMining) is downloaded from:
# http://www.rdatamining.com/data/rdmTweets-201306.RData
################################################################### 

####################
## GET THE TWEETS ##
####################

# install.packages("twitteR")
library(twitteR)

# show the working directory; the data file path below is relative to it
getwd()
# loading the .RData file creates the 'tweets' object used below
load(file = "data/twitter/rdmTweets-201306.RData")

# the data available for each tweet
str( tweets[[1]] )

# create a character vector of tweets' text out of the loaded tweets data;
# vapply() guarantees exactly one string per tweet and returns a plain vector
# (rather than a one-column matrix), which is what the rest of the script expects
v.tweets <- vapply(tweets,
                   function(x) as.character(x$text),
                   character(1),
                   USE.NAMES = FALSE)

v.tweets[1:10]

###################################
## PREPROCESSING OF TWEETS' TEXT ##
###################################

# the tm package is required for text processing
# install.packages("tm")
library(tm)

# build a corpus
## <! explain what a corpus is !>
# turn the vector of tweet texts into a tm corpus (one document per tweet)
tw.corpus <- Corpus( VectorSource(v.tweets) )
# inspect the internal structure of a single corpus item
str(object = tw.corpus[[1]])

# print the first 20 tweets
# NOTE: print.tweets() is defined in the UTILITY FUNCTIONS section at the end of
# this script, so that definition has to be run before this line
print.tweets(corpus = tw.corpus, from = 1, to = 20)

# the tm_map() f. applies a transformation to every document of the corpus;
# getTransformations() lists the transformations that tm offers out of the box
getTransformations()
# the purpose of all the transformations is to reduce the diversity among the words and 
# remove words that are of low importance

# convert text to lower case
# functions that are not tm transformations (tolower, our own gsub-based helpers)
# are wrapped in content_transformer(): tm_map() then modifies only the text of
# each document and the documents remain proper text documents, so no
# PlainTextDocument 'repair' step is needed afterwards
tw.corpus <- tm_map(tw.corpus, content_transformer(tolower))
print.tweets( tw.corpus, from = 1, to = 20 )

# remove references to other twitter users (@username) 
## <! introduce regular expressions !>
## excellent starting point: http://regex.bastardsbook.com/
removeUserRefs <- function(x) gsub("@\\w+", "", x)
tw.corpus <- tm_map(tw.corpus, content_transformer(removeUserRefs))
print.tweets( tw.corpus, from = 1, to = 10 )

## <! explain what stopwords are; 'general' stopwords and 'corpus-specific' stopwords !>
# add two extra ('corpus-specific') stop words ("rt" and "r") to the 'general' stopwords 
# for the English language
stopwords('english')[100:120]
tw.stopwords <- c(stopwords('english'), "rt", "r")
# remove stopwords from corpus
tw.corpus <- tm_map(tw.corpus, removeWords, tw.stopwords)
print.tweets( tw.corpus, from = 1, to = 20 )

# remove punctuation (this will also remove the hash sign (#) from hashtags)
tw.corpus <- tm_map(tw.corpus, removePunctuation, preserve_intra_word_dashes = TRUE)
print.tweets( tw.corpus, from = 1, to = 20 )
# in case we needed links from the tweets, we would have to extract them before 
# removing punctuation or perform the removal of punctuation in a different way

# remove URLs
# (this step runs after punctuation removal, so it targets runs of letters/digits
# that start with 'http')
removeURL <- function(x) gsub("http[[:alnum:]]+", "", x)  #[[:alnum:]] means [0-9A-Za-z] 
tw.corpus <- tm_map(tw.corpus, content_transformer(removeURL))
print.tweets( tw.corpus, from = 1, to = 20 )

# use regular expressions to detect tokens where a number is followed by a hyphen and 
# another number (e.g. 1-2) or a word (e.g. 200-million), and split them into 2 words 
splitWordsWithHyphen <- function(x) gsub("(\\d+)[-]([[:alnum:]]+)", "\\1 \\2", x)
tw.corpus <- tm_map(tw.corpus, content_transformer(splitWordsWithHyphen))
print.tweets( tw.corpus, from = 1, to = 20 )

# remove numbers
tw.corpus <- tm_map(tw.corpus, removeNumbers)
print.tweets( tw.corpus, from = 1, to = 20 )

# strip whitespace
tw.corpus <- tm_map(tw.corpus, stripWhitespace)

## <! explain stemming and why it is done !>
# word stemming is often part of text pre-processing; it helps in reducing the variability 
# in the word set, and is required for any text analysis that relies on the bag-of-words approach
# it is about reducing words to their common stem; e.g., words "update", "updated" and 
# "updating" would all be stemmed to "updat"
# it is often done with the Snowball stemmer: http://snowball.tartarus.org/
# an alternative to stemming is lemmatization, but that is more complex

# to use the Snowball stemmer in R, we need the SnowballC package
# (run the install line once, if the package is not installed yet)
# install.packages("SnowballC")
library(SnowballC)

# since we might later want to have words in their 'regular' form,
# we will keep a copy of the corpus before stemming it
tw.corpus.backup <- tw.corpus

# now, do the stemming: every word in every document is replaced by its stem
tw.corpus <- tm_map(tw.corpus, stemDocument, language = "english") 
print.tweets( tw.corpus, from = 1, to = 20)

#####################################
## BUILDING A DOCUMENT-TERM MATRIX ##
#####################################

## <! explain what DTM is !>
# Document Term Matrix (DTM) represents the relationship between terms and documents, 
# where each row stands for a document and each column for a term, and an entry is 
# weight of the term in the corresponding document

# the 'global' parameter is altered to require a word to appear in at least 1% of tweets
# (3 out of 320), to be included in the matrix; term weighting scheme is set to TF (term frequency)
# check the documentation of the TermDocumentMatrix() f. for other useful control parameters
# build the DTM directly from the (stemmed) corpus; PlainTextDocument() is a
# document constructor, not a transformation, so the corpus is not re-wrapped
# with it here
# - bounds$global = c(3, Inf): keep only the terms that occur in at least 3 documents
# - weighting = weightTf: entries are raw term frequencies
dtm <- DocumentTermMatrix(tw.corpus, 
                          control = list(bounds = list(global = c(3, Inf)),
                                         weighting = weightTf))
dtm
# the printed summary reports the sparsity of the matrix; a DTM is typically
# very sparse (mostly zero entries), which is a usual thing
# let's see what the matrix looks like, just a small bit of it
inspect(dtm[100:120, 220:240])

# let's see the terms that appear at least 10 times in the whole corpus
findFreqTerms(dtm, lowfreq = 10)
# we can also inspect the frequency of occurrence of all the terms
colSums(as.matrix(dtm))

#################################################
## CLUSTERING TWEETS USING K-MEDOIDS TECHNIQUE ##
#################################################

# first, let's remove sparse terms, to focus clustering on more relevant terms
# if a term has sparsity (i.e., the percentage of empty elements) above the value specified 
# by the 'sparse' parameter, it will be removed from the resulting DTM
dtm.reduced <- removeSparseTerms(dtm, sparse = 0.985)
dtm.reduced

# to cluster documents (tweets), we need to transform the DTM into a classical matrix format
dtm.final <- as.matrix(dtm.reduced)
dtm.final[1:10, 1:10]

## let's give names to the docs (tweets) so that we can trace them through the subsequent analysis
## since we do not have titles of tweets, we'll use the first 40 characters of their content
## (row i of the matrix is taken to correspond to the i-th element of v.tweets);
## all the names are set in a single vectorised assignment instead of a row-by-row loop
## (an alternative would be to use the processed text, tw.corpus[[i]]$content, as the name)
rownames(dtm.final) <- paste0(substr(v.tweets[seq_len(nrow(dtm.final))],
                                     start = 1, stop = 40),
                              "...")
dtm.final[1:10,1:5]

## <! introduce the K-medoids clustering !>
# we'll now try the k-medoids clustering method, which uses medoids instead of means
# to represent clusters; a medoid can be defined as the element of a cluster 
# whose average dissimilarity to all the other elements of the cluster is minimal 
# (it is the most centrally located element of the cluster);
# while in k-means a cluster center is the mean value of all the cluster elements,
# in k-medoids, a cluster center is always an instance from the dataset;
# k-medoids is more robust to noise and outliers than k-means; 
# it provides the so-called silhouette plot that can be used to evaluate the quality 
# of the clustering
# Wikipedia offers a simple explanation and example of this clustering method:
# https://en.wikipedia.org/wiki/K-medoids

# we use the pamk() f. which implements the Partitioning Around Medoids (PAM) clustering algorithm
# install.packages('fpc')
# install.packages("cluster")
library(fpc)
library(cluster)

# partitioning around medoids with estimation of the optimal number of clusters
# run PAM for every candidate number of clusters from 3 to 8 and let pamk()
# pick the best one
pam.res <- pamk(dtm.final, krange = 3:8)
# the number of clusters that pamk() selected
k <- pam.res[["nc"]]
k
# from here on we work only with the fitted PAM model (the pamobject element)
pam.res <- pam.res[["pamobject"]]
# how many tweets ended up in each cluster
table(pam.res[["clustering"]])

# for every cluster, list the terms that have a non-zero weight in its medoid
for (i in seq_len(k)) {
  cat(paste("cluster", i, ": "))
  cat(colnames(pam.res$medoids)[pam.res$medoids[i, ] > 0], "\n")
}
# obviously, we have 4 well defined clusters and one (the 1st) which is extremely broad

# Let's now see a sample of tweets from each cluster
# number of tweets per cluster (kept for reference)
tweets.cl.dist <- as.matrix(table( pam.res$clustering ))
for (i in seq_len(k)) {
  cat(paste("\nCLUSTER", i, ":\n"))
  # all the tweets assigned to the i-th cluster
  cl.tweets <- v.tweets[pam.res$clustering == i]
  # if the cluster has more than 10 tweets, keep a random sample of 10 of them;
  # otherwise, all the tweets of the cluster are printed
  if (length(cl.tweets) > 10) {
    cl.tweets <- cl.tweets[sample.int(n = length(cl.tweets), size = 10, replace = FALSE)]
  }
  print(cl.tweets)
}


# Finally, we'll inspect the silhouette plot, to more objectively assess the quality of the clusters
si <- silhouette(pam.res)

# one colour per cluster: the silhouette plot colours the bars cluster-wise only
# when the number of colours equals the number of clusters, so the colour vector
# is derived from the clustering instead of being fixed to 4 colours
n.clusters <- length(unique(pam.res$clustering))
cluster.cols <- c("red", "green", "blue", "yellow", "orange", "purple", "cyan", "brown")
cluster.cols <- if (n.clusters <= length(cluster.cols)) {
  cluster.cols[seq_len(n.clusters)]
} else {
  rainbow(n.clusters)
}

# the plot is written to a PDF file in the working directory
pdf('4medoids_silhouette_plot.pdf')
plot(si, col = cluster.cols)
dev.off()
# we can see also the summary statistics for the silhouette width
summary(si)

# interpretation of the silhouette plot: 
# - a large silhouette width (close to 1) suggests that the corresponding observations
#   are very well clustered; 
# - a small silhouette width (close to 0) means that the observations lie between two clusters
# - observations with a negative silhouette width are probably placed in the wrong cluster

# we can examine silhouette width and the associated information directly;
# for instance, let's examine the first 10 rows of the silhouette object
# (note: the rows may be ordered by cluster rather than by position in the corpus; see ?silhouette)
si[1:10,]
# we've seen that some instances (documents/tweets) have negative silhouette width
# meaning they are probably incorrectly clustered; let's inspect that
# (the 3rd column of the silhouette object holds the silhouette width)
badly.clustered <- as.vector(which(si[,3] < 0))
length(badly.clustered)
# let's see those badly clustered tweets, together with the alternative cluster
# stored in the 2nd column of the silhouette object;
# the row names are indexed directly (instead of taking the row names of a subset)
# so that the 'tweets' column is kept even when only one tweet is selected
badly.clust.df <- data.frame(tweets = rownames(si)[badly.clustered], 
                             alt.clust = si[badly.clustered, 2], row.names = NULL)
# library() stops with an error if knitr is missing, instead of failing later at kable()
library(knitr)
kable(badly.clust.df, format = "markdown")


#######################
## UTILITY FUNCTIONS ##
#######################

# a utility function for printing a sample of tweets from the corpus
# Prints the documents of a corpus, from position 'from' to position 'to',
# each one prefixed with its index ("[[i]] "), wrapped at 80 characters and
# followed by an empty line.
#
# Arguments:
#   corpus - a list-like collection of documents; corpus[[i]] must be
#            convertible to character (strwrap() does the conversion)
#   from   - index of the first document to print (default 1)
#   to     - index of the last document to print (default 5)
#
# Positions that fall outside of the corpus (e.g. 'to' greater than the number
# of documents) are skipped instead of raising a 'subscript out of bounds' error.
# Returns NULL, invisibly; the function is used for its printed output.
print.tweets <- function(corpus, from = 1, to = 5) {
  positions <- seq(from, to)
  # keep only the positions that actually exist in the corpus
  positions <- positions[positions >= 1 & positions <= length(corpus)]
  for (i in positions) {
    cat(paste0("[[", i, "]] "))
    writeLines(text = strwrap(corpus[[i]], width = 80))
    cat("\n")
  }
  invisible(NULL)
}

