###################################################################
# ACKNOWLEDGEMENTS
# this example is based on Chapter 10 of the book "R and Data Mining"
# http://www.rdatamining.com/books/rdm 
################################################################### 

####################
## GET THE TWEETS ##
####################

# load the tweets; FALSE is spelled out because T/F are ordinary variables
# that can be reassigned, while TRUE/FALSE are reserved words
tweets <- read.csv(file = "data/twitter/tweets_apple.csv", stringsAsFactors = FALSE)
str( tweets )

# fail early if the expected column is missing
# ('$' would silently return NULL in that case)
stopifnot("Tweet" %in% names(tweets))

# create a vector of tweets' text out of the loaded tweets data
v.tweets <- tweets[["Tweet"]]
v.tweets[10:30]

###################################
## PREPROCESSING OF TWEETS' TEXT ##
###################################

# many tweets have almost exactly the same content, the only difference being
# the presence of "RT" (re-tweets) and/or different URL 
# so, start by removing URLs and "RT" occurrences, and then remove duplicates

## <! introduce regular expressions !>
## excellent starting point: http://regex.bastardsbook.com/
# gsub() is vectorized, so the whole vector is processed in one call
# (no element-by-element loop is needed)

# remove URL; note that the trailing (.*) is greedy, so any text that follows
# the URL in the same tweet is removed together with the URL
v.tweets <- gsub("(f|ht)(tp)(s?)(://)(.*)[.|/](.*)", "", v.tweets)
# remove RT (optionally followed by a colon); the word boundaries (\\b) make
# sure that "RT" inside a longer word (e.g. "SMART") is left untouched
v.tweets <- gsub("\\bRT\\b:?", "", v.tweets)

# check for duplicates
is.duplicate <- duplicated(v.tweets)
sum(is.duplicate)
# remove the duplicates (64 were found in the original run of this script);
# logical indexing is used because v.tweets[-to.remove] returns an EMPTY
# vector when to.remove is empty, i.e. when there are no duplicates at all
to.remove <- which(is.duplicate)
v.tweets <- v.tweets[!is.duplicate]

# the tm package is required for text processing
# install.packages("tm")
library(tm)

# build a corpus
## <! explain what a corpus is !>
# VCorpus() is requested explicitly so that every corpus item is a separate
# text document; the per-document re-wrapping with PlainTextDocument used
# further down in this script relies on that
# NOTE(review): in newer tm versions the generic Corpus() may return a
# SimpleCorpus for a VectorSource - confirm against the installed tm version
tw.corpus <- VCorpus(VectorSource(v.tweets))
# let's see what a corpus item looks like
str(tw.corpus[[1]])

# create a copy of the corpus as we'll need it later
# (the original text is used for naming the rows of the document-term matrix)
original.corpus <- tw.corpus

# tm_map() f. allows for performing different transformations on the corpus
# list of frequently used transformations can be obtained with getTransformations() f.
getTransformations()
# the purpose of all the transformations is to reduce the diversity among the words and
# remove words that are of low importance

# convert text to lower case
# tolower() takes and returns a plain character vector; content_transformer()
# wraps it so that it modifies only the content of each document, hence the
# documents stay text documents and need not be re-wrapped afterwards
tw.corpus <- tm_map(tw.corpus, content_transformer(tolower))
# NOTE: print.tweets() is defined in the UTILITY FUNCTIONS section at the end
# of this file; that section has to be run before this line
print.tweets( tw.corpus, from = 1, to = 20 )

# when processing tweets, we often remove user references completely
# but this corpus is specific - it has many (meaningful) user references
# mostly references to Twitter accounts of various tech companies
# so, we will remove only the '@' sign in front of twitter users (@username) 
# Drops the '@' that precedes a user name and keeps the name itself
# (e.g. "@username" becomes "username").
# x: character vector (or an object gsub() can coerce to one)
# returns: character vector of the same length as x
removeUserRefs <- function(x) {
  gsub(pattern = "@(\\w+)", replacement = "\\1", x = x)
}
# strip the '@' signs throughout the corpus and check the result on the first 20 tweets
tw.corpus <- tm_map(x = tw.corpus, FUN = removeUserRefs)
print.tweets(corpus = tw.corpus, from = 1, to = 20)

# remove hash (#) sign from hashtags
# Drops the '#' that precedes a run of alphanumeric characters and keeps the
# tag text itself (e.g. "#iphone6" becomes "iphone6").
# x: character vector (or an object gsub() can coerce to one)
# returns: character vector of the same length as x
removeHash <- function(x) {
  gsub(pattern = "#([[:alnum:]]+)", replacement = "\\1", x = x)
}
# strip the '#' signs throughout the corpus and check the result on tweets 10-30
tw.corpus <- tm_map(x = tw.corpus, FUN = removeHash)
print.tweets(corpus = tw.corpus, from = 10, to = 30)

# replace :-), :), :(, :-( with "SMILEY"
# Replaces emoticons such as :-), :), :(, :-( (also with ';' as the eyes, or a
# space instead of the '-') with the token "SMILEY".
# Inside a bracket expression '|' does not mean "or" - it is a literal
# character - so it is not used here; otherwise sequences like "||" would be
# replaced as well.
# x: character vector (or an object gsub() can coerce to one)
# returns: character vector of the same length as x
replaceSmiley <- function(x) gsub("[:;][- ]?[()]", "SMILEY", x)
# substitute the emoticons throughout the corpus and check the result on tweets 10-30
tw.corpus <- tm_map(x = tw.corpus, FUN = replaceSmiley)
print.tweets(corpus = tw.corpus, from = 10, to = 30)

# use regular expressions to detect multi-part words, that is, compound words where two words or numbers are connected 
# with a hyphen (e.g. 1-2, 200-million), a dot, an underscore or a slash, and then split them into 2 words 
# Splits compound words by replacing a separator that stands between two
# alphanumeric characters with a space (e.g. "200-million" -> "200 million").
# The set of separators (the bracket expression) is kept exactly as before.
# A single gsub() pass cannot split around a one-character middle part: in
# "1-2-3" the "2" is consumed by the first match, so the second separator is
# left in place. The substitution is therefore applied twice; the second pass
# picks up exactly the separators skipped by the first one.
# x: character vector (or an object gsub() can coerce to one)
# returns: character vector of the same length as x
splitMultiWords <- function(x) {
  separator.pattern <- "([[:alnum:]])[-|\\.|_|/]([[:alnum:]])"
  first.pass <- gsub(separator.pattern, "\\1 \\2", x)
  gsub(separator.pattern, "\\1 \\2", first.pass)
}
# split the compound words throughout the corpus
tw.corpus <- tm_map(x = tw.corpus, FUN = splitMultiWords)

## <! explain what stopwords are; 'general' stopwords and 'corpus-specific' stopwords !>
# add an extra ('corpus-specific') stop word ("apple") to the 'general' stopwords 
# for the English language ("RT" markers were already removed from the raw text above)
# peek at a slice of the general English stopword list
english.stopwords <- stopwords("english")
english.stopwords[100:120]
# the corpus-specific stopword is appended to the general list
tw.stopwords <- c(english.stopwords, "apple")
# drop all these stopwords from the corpus
tw.corpus <- tm_map(tw.corpus, removeWords, words = tw.stopwords)
print.tweets(corpus = tw.corpus, from = 1, to = 20)

# strip punctuation characters
tw.corpus <- tm_map(x = tw.corpus, FUN = removePunctuation)
print.tweets(corpus = tw.corpus, from = 1, to = 20)

# remove only stand-alone numbers (not numbers in e.g. iphone7 or g3) 
# Removes numbers that stand alone, i.e. are delimited by whitespace or by the
# start/end of the string; digits that are part of a word (iphone7, g3) stay.
# The whitespace in front of the number is kept (\\1) and the whitespace
# after it is only looked at, not consumed ((?=...), hence perl = TRUE):
#  - the neighbouring words are not glued together ("has 5 apps" must not
#    become "hasapps")
#  - several numbers in a row are all removed
# Surplus spaces left behind are expected to be collapsed afterwards
# (the script does that with stripWhitespace).
# x: character vector (or an object gsub() can coerce to one)
# returns: character vector of the same length as x
removeStandAloneNumbers <- function(x) {
  gsub("(^|\\s)\\d+(?=\\s|$)", "\\1", x, perl = TRUE)
}
# drop the stand-alone numbers throughout the corpus and check the first 20 tweets
tw.corpus <- tm_map(x = tw.corpus, FUN = removeStandAloneNumbers)
print.tweets(corpus = tw.corpus, from = 1, to = 20)

# collapse the surplus whitespace
tw.corpus <- tm_map(x = tw.corpus, FUN = stripWhitespace)

# turn the corpus items (back) into plain text documents; this is important
# for stemming to work properly and for the subsequent creation of the DTM
tw.corpus <- tm_map(x = tw.corpus, FUN = PlainTextDocument)

## <! explain stemming and why it is done !>
# word stemming is often part of text pre-processing; it helps in reducing the variability 
# in the word set, and is required for any text analysis that relies on the bag-of-words approach
# it is about reducing words to their common stem; e.g., words "update", "updated" and 
# "updating" would all be stemmed to "updat"
# it is often done with the Snowball stemmer: http://snowball.tartarus.org/
# alternative to stemming is lemmatization, but that is more complex

# to use the Snowball stemmer in R, we need the SnowballC package
#install.packages("SnowballC")
library(SnowballC)

# since we might later want to have words in their 'regular' form,
# we will keep a copy of the corpus before stemming it
# snapshot of the corpus with the words still in their unstemmed form
tw.corpus.backup <- tw.corpus

# stem every document with the English stemmer and check the first 20 tweets
tw.corpus <- tm_map(x = tw.corpus, FUN = stemDocument, language = "english")
print.tweets(corpus = tw.corpus, from = 1, to = 20)

#####################################
## BUILDING A DOCUMENT-TERM MATRIX ##
#####################################

## <! explain what DTM is !>
# Document Term Matrix (DTM) represents the relationship between terms and documents, 
# where each row stands for a document and each column for a term, and an entry is 
# weight of the term in the corresponding document

# the 'global' parameter is altered to require a word to appear in at least ~1% of tweets
# (10 out of 1117), and at most in ~95% of tweets to be included in the matrix; term weighting scheme is set to TF (term frequency)
# check the documentation of the DocumentTermMatrix() and termFreq() f. for other useful control parameters
# make sure the corpus items are plain text documents before building the DTM
tw.corpus <- tm_map(x = tw.corpus, FUN = PlainTextDocument)
# control options: a term has to occur in 10 to 1060 documents, has to be at
# least 2 characters long, and is weighted by its term frequency
dtm.control <- list(bounds = list(global = c(10, 1060)),
                    wordLengths = c(2, Inf),
                    weighting = weightTf)
dtm <- DocumentTermMatrix(tw.corpus, control = dtm.control)
dtm
# the DTM is very sparse - the vast majority of its entries are zeros
# (98% in the original run); this is usual for document-term matrices
# a small portion of the matrix, to see what it looks like
inspect(dtm[100:120, 100:120])

# the terms that appear at least 10 times in the whole corpus
findFreqTerms(dtm, lowfreq = 10)
# the frequency of occurrence of all the terms, computed once...
term.freq <- colSums(as.matrix(dtm))
term.freq
# ...and shown again sorted, the most frequent terms first
sort(term.freq, decreasing = TRUE)

#################################################
## CLUSTERING TWEETS USING K-MEDOIDS TECHNIQUE ##
#################################################

# to cluster documents (tweets), we need to transform the DTM into a classical matrix format
dtm.final <- as.matrix(dtm)
dtm.final[1:10, 1:10]

## let's give names to the docs (tweets) so that we can trace them through the subsequent analysis
## since we do not have titles of tweets, we'll use the first 50 characters of their
## original (unprocessed) content, followed by "..."
## all the labels are built first and assigned in one step; seq_len() is used
## instead of 1:nrow() so that a matrix with no rows is handled correctly
doc.labels <- vapply(
  seq_len(nrow(dtm.final)),
  function(i) paste0(substr(original.corpus[[i]]$content, start = 1, stop = 50), "..."),
  character(1)
)
row.names(dtm.final) <- doc.labels
dtm.final[1:10,1:5]

## <! introduce the K-medoids clustering !>
# we'll now try the k-medoids clustering method, which uses medoids instead of means
# to represent clusters; a medoid can be defined as the element of a cluster 
# whose average dissimilarity to all the other elements of the cluster is minimal 
# (it is the most centrally located element of the cluster);
# while in k-means a cluster center is the mean value of all the cluster elements,
# in k-medoids, a cluster center is always an instance from the dataset;
# k-medoids is more robust to noise and outliers than k-means; 
# for an intuitive comparison of k-means and k-medoids, check this link:
# http://stats.stackexchange.com/questions/156210/difference-between-k-means-and-k-medoid
# it provides the so-called silhouette plot that can be used to evaluate the quality 
# of the clustering
# Wikipedia offers a simple explanation and example of this clustering method:
# https://en.wikipedia.org/wiki/K-medoids

# we use the pamk() f. which implements the Partitioning Around Medoids (PAM) clustering algorithm
# install.packages('fpc')
# install.packages("cluster")
library(fpc)
library(cluster)

# partitioning around medoids (PAM) is an algorithm for finding medoids
# pamk f. implements PAM algorithm and estimates the optimal number of clusters
# fixed seed, so that the run can be repeated
set.seed(1211)
# try 3 to 8 clusters
pamk.fit <- pamk(dtm.final, krange = 3:8)
# the number of clusters identified as the best
k <- pamk.fit$nc
k
# the clustering itself is stored in the pamobject element of the result
pam.res <- pamk.fit$pamobject
# how the tweets are distributed across the clusters
table(pam.res$clustering)

# let's check the terms that appear in cluster medoids
# one output line per cluster: the cluster number followed by the terms that
# have a weight above zero in the medoid of that cluster
medoids <- pam.res$medoids
for (cl in seq_len(k)) {
  header <- paste("cluster", cl, ": ")
  in.medoid <- which(medoids[cl, ] > 0)
  cat(header)
  cat(colnames(medoids)[in.medoid], "\n")
}

# create a word cloud for each cluster to better understand what clusters are about
# the f. requires TermDocument matrix; we have DocumentTerm matrix; we need to transpose the matrix
# - the loop covers all k clusters (k is estimated above and need not be 3)
# - drop = FALSE keeps the selection a matrix even if a cluster holds a single tweet
# NOTE: createWordCloud() is defined in the UTILITY FUNCTIONS section at the
# end of this file; that section has to be run before this loop
for (cl in seq_len(k)) {
  createWordCloud( t(dtm.final[pam.res$clustering == cl, , drop = FALSE]) )
}

# Let's now see a sample of tweets from each cluster
# number of tweets in each cluster
tweets.cl.dist <- as.matrix(table( pam.res$clustering ))
for (cl in seq_len(k)) {
  cat(paste("\nCLUSTER", cl, ":\n"))
  cl.tweets <- v.tweets[pam.res$clustering == cl]
  cl.size <- tweets.cl.dist[cl]
  # clusters with more than 10 tweets are represented by a random sample of
  # 10 tweets; smaller clusters are printed in full
  if (cl.size > 10) {
    picked <- sample(x = 1:cl.size, size = 10, replace = FALSE)
    cl.tweets <- cl.tweets[picked]
  }
  print(cl.tweets)
}


# Finally, we'll inspect the silhouette plot, to more objectively assess the quality of the clusters
si <- silhouette(pam.res)
# one colour per cluster: the number of clusters is estimated (3 to 8 are
# tried above), so the colour vector is cut to the actual number of clusters
# instead of being fixed to 3 colours
cluster.palette <- c('#e41a1c', '#377eb8', '#4daf4a', '#984ea3',
                     '#ff7f00', '#ffff33', '#a65628', '#f781bf')
n.clusters <- length(unique(si[, "cluster"]))
pdf('3_medoids_silhouette_plot.pdf')
plot(si, col = rep_len(cluster.palette, n.clusters))
dev.off()
# we can see also the summary statistics for the silhouette width
summary(si)

# interpretation of the silhouette plot: 
# - a large silhouette width (close to 1) suggests that the corresponding observations
#   are very well clustered; 
# - a small silhouette width (close to 0) means that the observations lie between two clusters
# - observations with a negative silhouette width are probably placed in the wrong cluster

# we can examine silhouette width and the associated information directly;
# for instance, let's examine the info for the first 10 documents from the corpus
si[1:10,]
# some instances (documents/tweets) have a negative or close-to-zero silhouette
# width, meaning they are probably in the wrong cluster or lie between two
# clusters; let's inspect those whose width is below the threshold
sil.threshold <- 0.05
badly.clustered <- as.vector(which(si[, "sil_width"] < sil.threshold))
length(badly.clustered)
# let's see those badly clustered tweets, together with the neighbouring
# cluster of each of them; the names are taken from the row names of si
# directly, since si[badly.clustered, ] collapses to a plain vector (and loses
# its row names) when exactly one tweet is selected
badly.clust.df <- data.frame(tweets = row.names(si)[badly.clustered],
                             alt.clust = si[badly.clustered, "neighbor"],
                             row.names = NULL)
# library() stops with an error if knitr is missing, whereas require() only
# returns FALSE and lets the next line fail with a less clear message
library(knitr)
kable(badly.clust.df, format = "markdown")

#######################
## UTILITY FUNCTIONS ##
#######################
#install.packages("wordcloud")
library(wordcloud)

## function for creating a word cloud
## Draws a gray-scale word cloud of the words whose total frequency is above
## the average frequency of all the words.
## NOTE: this function is called earlier in this file than it is defined, so
## this section has to be run before the analysis above.
## termDocRegMatrix: regular (dense) term-document matrix - terms in rows,
##   documents in columns, with terms as row names
## returns: the value of wordcloud(), or NULL (invisibly) if there is nothing to draw
## side effect: sets the random seed to 1234 so that the layout can be reproduced
createWordCloud <- function( termDocRegMatrix ) {
  
  word.freq <- sort(rowSums( termDocRegMatrix ), decreasing = TRUE)
  # keep the words that are more frequent than the average word
  # (which() skips comparisons that result in NA, as subset() did)
  word.freq <- word.freq[which(word.freq > mean(word.freq))]
  
  # nothing is left if the matrix has no terms or all the terms are equally
  # frequent; max() and wordcloud() would fail on empty input
  if (length(word.freq) == 0) {
    message("createWordCloud: no word is more frequent than the average; nothing to draw")
    return(invisible(NULL))
  }
  
  words <- names(word.freq)
  
  # now, we can draw the word cloud
  set.seed(1234)
  gray.levels <- gray( (word.freq + 10) / (max(word.freq) + 10) )
  wordcloud(words = words, freq = word.freq, 
            random.order = FALSE, colors = gray.levels)
}

## a utility function for printing a sample of tweets from the corpus
## Prints the documents from..to of the corpus, each preceded by its index in
## double square brackets, wrapped at 80 characters and followed by an empty line.
## NOTE: this function is called earlier in this file than it is defined, so
## this section has to be run before the analysis above.
## corpus: a corpus (or a list) whose items can be converted to text
## from, to: indices of the first and the last document to print; 'to' is
##   lowered to the number of documents if it exceeds it, and nothing is
##   printed if 'from' lies beyond the last document
## returns: NULL, invisibly
print.tweets <- function(corpus, from = 1, to = 10) {
  n.docs <- length(corpus)
  if (n.docs == 0 || from > n.docs) {
    return(invisible(NULL))
  }
  # do not run past the last document (corpus[[i]] would stop with an error)
  to <- min(to, n.docs)
  for(i in from : to) {
    cat(paste0("[[", i, "]] "))
    writeLines(text = strwrap(corpus[[i]], width = 80))
    cat("\n")
  }
  invisible(NULL)
}

