library(tm)
library(SnowballC)

## data: labelled Yelp reviews, read from a semicolon-separated file
## NOTE(review): the original comment linked the UCI NYSK (news articles) data set,
## which does not match this file; the Yelp sentences presumably come from
## https://archive.ics.uci.edu/ml/datasets/Sentiment+Labelled+Sentences - confirm
reviews.data <- read.csv(file = "data/reviews/yelp_labelled.csv",
                         colClasses = c("factor", "character"), sep = ";")
str(reviews.data)

# create a vector of the reviews' texts to be used for further analysis
reviews <- as.vector(reviews.data$text)

# create corpus
# VCorpus is requested explicitly: on newer tm releases Corpus() may build a
# SimpleCorpus from a VectorSource, and the tm_map(..., PlainTextDocument)
# calls used further down only work on a corpus that stores one object per document
r.corpus <- VCorpus(VectorSource(reviews))
str(r.corpus[[1]])

# utility f. for printing a subset of documents from the corpus
#   corpus:   a tm corpus (or any list-like object) whose elements strwrap()
#             can turn into text
#   from, to: positions of the first and the last document to print; positions
#             outside 1..length(corpus) are skipped instead of raising
#             a "subscript out of bounds" error
# Every document is written as "[[i]] <text wrapped at 80 columns>" followed by
# an empty line. Called for its printed output; returns NULL invisibly.
# NOTE(review): the name reads like an S3 print method for class "reviews";
# it is kept because the rest of the script calls the function by this name.
print.reviews <- function(corpus, from = 1, to = 10) {
  positions <- from:to
  # drop positions the corpus does not have (e.g. the default to = 10 on a
  # corpus with fewer than 10 documents, or an empty corpus)
  positions <- positions[positions >= 1 & positions <= length(corpus)]
  for (i in positions) {
    cat(paste0("[[", i, "]] "))
    writeLines(text = strwrap(corpus[[i]], width = 80))
    cat("\n")
  }
  invisible(NULL)
}

print.reviews(r.corpus)

# convert text to lower case
# content_transformer() applies the base function to each document's text while
# the documents keep their tm class, so no PlainTextDocument re-wrap is needed
r.corpus <- tm_map(r.corpus, content_transformer(tolower))

# remove stopwords from corpus: tm's English list plus a few extra frequent words
more.stopwords <- c("will", "yet", "want", "say", "said", "can", "may", "now", "also", "get")
to.remove <- c(stopwords(kind = "en"), more.stopwords)
r.corpus <- tm_map(r.corpus, removeWords, to.remove)
print.reviews(r.corpus)

# remove URLs
#   x: character vector (or anything gsub() can coerce to character)
# Returns x with every http://, https:// or ftp:// URL replaced by "".
# A URL is taken to end at the first whitespace character, so the text that
# follows it is kept (a greedy ".*" would swallow the rest of the document).
removeURLs <- function(x) gsub("(f|ht)tps?://\\S+", "", x)
# removeURLs is a plain character function, so it is wrapped in
# content_transformer() to keep the documents' tm class intact
r.corpus <- tm_map(r.corpus, content_transformer(removeURLs))

# remove punctuation
r.corpus <- tm_map(r.corpus, removePunctuation)
print.reviews(r.corpus)

# remove unicode characters: \u2019, \u201c, ...
# taken from: http://stackoverflow.com/questions/24147816/remove-unicode-f0b7-from-corpus-text
# Delete every occurrence of the given characters from x.
#   x:          character vector to clean
#   characters: character vector of patterns; they are joined with "|" into a
#               single PCRE alternation, so regex metacharacters are NOT escaped
# Returns x with all matches replaced by "".
removeCharacters <- function(x, characters) {
  alternatives <- paste(characters, collapse = "|")
  # (*UCP) switches PCRE to Unicode-aware character properties
  pattern <- sprintf("(*UCP)(%s)", alternatives)
  gsub(pattern, "", x, perl = TRUE)
}
# apply removeCharacters to every document; the characters removed are the
# typographic quotes (\u2019, \u201c, \u201d), the em dash (\u2014), the no-break
# space (\u00a0), the accented letters \u00f3, \u00e1, \u00ed and the copyright
# sign (\u00A9)
# NOTE(review): matches are replaced by "", so a removed no-break space or dash
# leaves no separator between the neighbouring words - confirm this is intended
r.corpus <- tm_map(r.corpus, removeCharacters, 
                      c("\u2019","\u201c", "\u201d", "\u2014", "\u00a0", "\u00f3", 
                        "\u00e1", "\u00ed", "\u00A9"))

# remove numbers
r.corpus <- tm_map(r.corpus, removeNumbers)

# strip whitespace
r.corpus <- tm_map(r.corpus, stripWhitespace)
print.reviews( r.corpus )

# important for stemming to work properly and for subsequent creation of the
# term-document matrix
# NOTE(review): presumably needed because tm_map() calls with plain functions
# (e.g. removeCharacters above) leave bare character vectors instead of tm
# documents - confirm against the installed tm version
r.corpus <- tm_map(r.corpus, PlainTextDocument)

# since we might later want to have words in their 'regular' form,
# we will keep a copy of the corpus before stemming it
r.corpus.backup <- r.corpus

# now, do the stemming
r.corpus <- tm_map(r.corpus, stemDocument, language = "english") 
# print documents 10 to 30 to inspect the stemmed text
print.reviews( r.corpus, 10, 30 )

# build the term-document matrix (terms in rows, documents in columns)
# NOTE(review): re-wrapping as PlainTextDocument looks redundant right after
# stemming - confirm before removing it
r.corpus <- tm_map(r.corpus, PlainTextDocument)
# bounds:      keep only terms found in at least 10 and at most 950 documents
#              (an earlier note said ~5% / ~90% of docs, which does not match
#              these counts for a 1000-document corpus - TODO confirm corpus size)
# wordLengths: keep only terms with at least 3 characters
# weighting:   plain term frequency
tdm <- TermDocumentMatrix(r.corpus,
                          control = list(bounds = list(global = c(10, 950)),
                                         wordLengths = c(3, Inf),
                                         weighting = weightTf))
tdm

# the terms that appear at least 50 times in the whole corpus could be listed with:
# findFreqTerms(tdm, lowfreq = 50)
# we can also inspect the frequency of occurrence of the most frequent terms;
# head() shows up to 100 of them without padding the result with NA when the
# vocabulary is smaller
head(sort(rowSums(as.matrix(tdm)), decreasing = TRUE), 100)

# to cluster documents, we need a classical matrix with documents in rows and
# terms in columns, hence the transpose
dtm.final <- as.matrix(t(tdm))
dtm.final[seq_len(min(10, nrow(dtm.final))), seq_len(min(10, ncol(dtm.final))), drop = FALSE]

## let's give names to the docs so that we can trace them through the subsequent analysis
## the reviews have no titles, so the first 50 characters of each review are used
rownames(dtm.final) <- paste0(substr(reviews[seq_len(nrow(dtm.final))], start = 1, stop = 50), "...")
dtm.final[seq_len(min(10, nrow(dtm.final))), seq_len(min(5, ncol(dtm.final))), drop = FALSE]


## DO K-MEDOIDS CLUSTERING
library(fpc)
library(cluster)

# k-medoids (PAM) clustering; pamk() is given the candidate numbers of
# clusters 2..6 and reports the one it estimates to be the best
set.seed(1111)
pam.res <- pamk(dtm.final, krange = 2:6)
# the number of clusters that was selected
k <- pam.res[["nc"]]
k
# only the fitted pam object is needed from here on, so pam.res is re-bound to it
pam.res <- pam.res[["pamobject"]]
# how many documents ended up in each cluster
table(pam.res[["clustering"]])

# check the most frequent terms in cluster medoids
# (each medoid is a row of term frequencies taken from dtm.final)
for (i in seq_len(k)) {
  cat(paste("cluster", i, ": "))
  medoid.tf <- pam.res$medoids[i, ]
  # keep only the terms present in the medoid and order them by frequency;
  # sorting the result of which() would order column positions, not counts.
  # head() returns fewer than 10 terms for short medoids instead of NA padding
  most.freq.clust.terms <- head(sort(medoid.tf[medoid.tf > 0], decreasing = TRUE), 10)
  cat(names(most.freq.clust.terms), "\n")
}

# Show example reviews for every cluster: all of them for clusters with at most
# 10 members, otherwise a fixed-seed random sample of 10
reviews.cl.dist <- as.matrix(table(pam.res$clustering))
for (i in seq_len(k)) {
  cat(paste("\nCLUSTER", i, ":\n"))
  # size of the current cluster
  n <- reviews.cl.dist[i]
  if (n > 10) {
    # re-seeding inside the loop makes the sample of every cluster reproducible
    set.seed(1211)
    indices <- sample(seq_len(n), size = 10, replace = FALSE)
    cl.reviews <- reviews[pam.res$clustering == i]
    print(cl.reviews[indices])
  } else {
    print(reviews[pam.res$clustering == i])
  }
}

# add the obtained clusters to the initial dataset
# cluster 1 is about negative sentiment, cluster 2 (tends towards) positive
# the two labels below only fit a 2-cluster solution; with any other k the
# documents outside clusters 1 and 2 would silently get an NA label
if (k != 2) {
  warning("expected 2 clusters but pamk() selected ", k,
          "; documents outside clusters 1 and 2 get an NA cluster label",
          call. = FALSE)
}
reviews.data$cluster <- factor(pam.res$clustering, levels = c(1, 2), labels = c("negative", "positive"))
str(reviews.data)
# check in how many instances class label and cluster are the same
# (sum() with na.rm counts only real matches; subsetting the data frame with a
# logical vector containing NA would add an all-NA row per NA and inflate the count)
# NOTE(review): assumes the label column uses the same "negative"/"positive"
# values as the cluster factor - confirm against the input file
sum(reviews.data$label == reviews.data$cluster, na.rm = TRUE)
# in 513, 51.3% of cases - not good, roughly equal to random guessing

# Finally, we'll inspect the silhouette plot, to more objectively assess the quality of the clusters
si <- silhouette(pam.res)
# one colour per cluster; six colours cover the largest k tried by pamk (2..6),
# and the first two are the ones used for the 2-cluster solution
cluster.colors <- c("#e41a1c", "#377eb8", "#4daf4a", "#984ea3", "#ff7f00", "#ffff33")
pdf('2_medoids_silhouette_plot.pdf')
# close the PDF device even if plotting fails, so no device is left open
invisible(tryCatch(
  plot(si, col = cluster.colors[seq_len(k)]),
  finally = dev.off()
))
# we can see also the summary statistics for the silhouette width
summary(si)


# we've seen that some instances (documents) have negative silhouette width,
# which suggests they would fit a neighbouring cluster better; let's inspect that
# (column 3 of the silhouette matrix is the silhouette width)
badly.clustered <- as.vector(which(si[, 3] < 0))
length(badly.clustered)
# let's see those badly clustered documents together with their neighbour
# cluster (column 2 of the silhouette matrix)
# rownames(si)[...] is used instead of row.names(si[..., ]) because the latter
# drops to a plain vector - and loses the document names - when exactly one
# document is selected
badly.clust.df <- data.frame(docs = rownames(si)[badly.clustered],
                             alt.clust = si[badly.clustered, 2], row.names = NULL)
# library() fails loudly if knitr is missing, whereas require() only returns FALSE
library(knitr)
kable(badly.clust.df, format = "markdown")
