###################################################################
# ACKNOWLEDGEMENTS
#
# The dataset used in this example is a preprocessed subset of the Ling-Spam Dataset
# (http://csmining.org/index.php/ling-spam-datasets.html). It is based on 960 real email 
# messages from a linguistics mailing list.
# The dataset was prepared for the Machine Learning course taught by Stanford Prof. Andrew Ng; 
# it is downloaded from:
# http://openclassroom.stanford.edu/MainFolder/DocumentPage.php?course=MachineLearning&doc=exercises/ex6/ex6.html
################################################################### 

##################
## LOADING DATA ##
##################

# we'll start by loading the data
# the dataset we'll be working with is split into two subsets: a 700-email subset for training 
# and a 260-email subset for testing; each subset contains 50% spam and 50% nonspam messages. 
# print the working directory: the relative "data/..." paths below are resolved against it
getwd()

# locations of the messages used for training and validation of the classifier
spam.train.dir <- "data/emails/spam-train"
nonspam.train.dir <- "data/emails/nonspam-train"

# names of the files holding the spam messages for training
spam.train.files <- list.files(spam.train.dir)
# how many are there, and what do the first ten names look like?
length(spam.train.files)
spam.train.files[seq_len(10)]

# names of the files holding the nonspam messages for training
nonspam.train.files <- list.files(nonspam.train.dir)

# auxiliary f. for reading text from a file
# Reads a text file and returns its whole content as a single string.
#   pathname: path of the file to read (handed to readLines())
# The lines of the file are glued back together with "\n" between them.
read.text <- function(pathname) {
  file.lines <- readLines(pathname)
  paste(file.lines, collapse = "\n")
}

# read the text of all spam messages of the training set in one pass
# (vapply() over the file paths replaces growing the data frame with rbind() in a loop,
# and it simply yields an empty vector if the directory holds no files)
spam.train.text <- vapply(file.path(spam.train.dir, spam.train.files),
                          read.text, character(1), USE.NAMES = FALSE)
# start the train.data dataframe with the spam messages; text and label are kept as strings
train.data <- data.frame(text = spam.train.text,
                         label = rep("spam", length(spam.train.text)),
                         stringsAsFactors = FALSE)
str(train.data)
head(train.data)

# read the text of all nonspam messages of the training set, and use it to extend train.data
nonspam.train.text <- vapply(file.path(nonspam.train.dir, nonspam.train.files),
                             read.text, character(1), USE.NAMES = FALSE)
train.data <- rbind(train.data,
                    data.frame(text = nonspam.train.text,
                               label = rep("ham", length(nonspam.train.text)),
                               stringsAsFactors = FALSE))
tail(train.data)


# the way we created the train.data dataframe, the first 350 observations are of the class 'spam',
# and the rest are 'nonspam'; since we would need to split this dataset into training and validation
# subsets with roughly equal percentage of spam and nonspam messages in each of the subsets, 
# we need to 'shuffle' the rows, to introduce some 'diversity'
# one random number will be drawn per row of the training data
train.size <- nrow(train.data)
# fix the seed so that the shuffle is reproducible
set.seed(12345)
# draw train.size random numbers from the uniform distribution
randomized <- runif(train.size)
# the ordering of those random numbers defines the new (shuffled) order of the rows
shuffled.rows <- order(randomized)
train.data <- train.data[shuffled.rows, ]
# check that the rows have really been shuffled: the first ten labels should be mixed
train.data[1:10, "label"]

# now, load the dataset that will be used for testing the classifier 

# directory with spam messages for testing
spam.test.dir <- "data/emails/spam-test"
spam.test.files <- list.files(path = spam.test.dir)
length(spam.test.files)

# directory with nonspam messages for testing
nonspam.test.dir <- "data/emails/nonspam-test"
nonspam.test.files <- list.files(path = nonspam.test.dir)
length(nonspam.test.files)

# read the text of all test messages in one pass per class
# (vapply() over the file paths replaces growing the data frame with rbind() in a loop,
# and it simply yields an empty vector if a directory holds no files)
spam.test.text <- vapply(file.path(spam.test.dir, spam.test.files),
                         read.text, character(1), USE.NAMES = FALSE)
nonspam.test.text <- vapply(file.path(nonspam.test.dir, nonspam.test.files),
                            read.text, character(1), USE.NAMES = FALSE)

# build the testing.data dataframe: spam messages first, nonspam ('ham') messages after them;
# text and label are kept as strings
testing.data <- data.frame(text = c(spam.test.text, nonspam.test.text),
                           label = rep(c("spam", "ham"),
                                       times = c(length(spam.test.text),
                                                 length(nonspam.test.text))),
                           stringsAsFactors = FALSE)
str( testing.data )


######################
## DATA PREPARATION ##
######################

# install the required library only if it is missing (so that re-running the script
# does not re-download it every time), then load it
if (!requireNamespace("tm", quietly = TRUE)) {
  install.packages("tm")
}
library(tm)

# we will create the corpus using both training and testing datasets
# so, let's merge them first
all.data <- rbind(train.data, testing.data)
dim(all.data)
# 960 instances in total: first 700 are for training, the rest (260) for testing 

# create corpus
corpus <- Corpus(VectorSource(all.data$text))
str(corpus[1])

# the loaded text is already pre-processed: 
# - stop-words have been removed
# - numbers and punctuation have been removed 
# - text has been converted to lower case
# - it has also been lemmatized
# - all white spaces (tabs, newlines, spaces) have been trimmed to a single space character.
# so, no need for pre-processing it here

# create document-term matrix using words of length 2+
# build the document-term matrix, keeping only the words with 2 or more characters
dtm.control <- list(wordLengths = c(2, Inf))
dtm <- DocumentTermMatrix(corpus, control = dtm.control)
dtm

# drop the sparse terms (0.975 is the sparsity threshold handed to removeSparseTerms)
dtm <- removeSparseTerms(dtm, 0.975)
dtm
# 1268 words preserved out of the original set of 22744 words

# the emails (documents) are what we want to classify, so we turn the
# DTM into a plain matrix: one row per email, one column per word
dtm <- as.matrix(dtm)
dim(dtm)

# the ML algorithms that we'll use (kNN, NB) work best when all the features
# take values from roughly the same range
# to see the current ranges, pick 20 features at random (without replacement)...
sampled.features <- sample(ncol(dtm), size = 20)
# ...and summarize their values
summary( dtm[, sampled.features] )

# obviously, value ranges do differ and there is a need for normalization
# we'll achieve that by applying the following formula to each feature x:
# (x - min(x)) / (max(x) - min(x))
# we'll create a utility function that does the normalization...
# Min-max normalization: rescales the values of a feature to the [0, 1] range
# using (x - min(x)) / (max(x) - min(x)).
#   feature: numeric vector with the values of one feature
# Returns a numeric vector of the same length. A constant feature (max == min)
# is mapped to all zeros; without that guard the formula would yield 0/0 = NaN.
normalize.feature <- function( feature ) {
  feature.range <- range(feature)
  span <- feature.range[2] - feature.range[1]
  shifted <- feature - feature.range[1]
  # span is NA/NaN when the feature contains missing or infinite values;
  # in that case we fall through to the plain formula
  if (!is.na(span) && span == 0) {
    return (shifted)
  }
  shifted / span
}

# ...and, now apply the function to each feature (column) in our dataset  
# ...and now run it over every column (feature) of the matrix;
# the result is turned into a data frame so that a label column can be attached
norm.dtm <- as.data.frame( apply(dtm, 2, normalize.feature) )
# check the first ten features: they should have been normalized
summary(norm.dtm[, 1:10])
# finally, attach the class labels as the TargetLabel column of the normalized dataset
norm.dtm$TargetLabel <- all.data$label
# look at the last columns of the first ten rows to check that TargetLabel is where we expect it
last.col <- ncol(norm.dtm)
norm.dtm[1:10, (last.col - 10):last.col]


# we'll now split the training data into training and validation subsets
# we'll take 75% of the original training set for training the ML model; 
# the remaining 25% is to be used for validation
# note: we're not using testing data for model building 
# train.size is the number of emails in the original training dataset (700);
# these emails occupy the first train.size rows of norm.dtm
training.size <- ceiling( train.size * 0.75 )
training.set <- norm.dtm[1:training.size, ]
validate.set <- norm.dtm[(training.size+1):train.size, ]
nrow(training.set)
nrow(validate.set)


#####################################
######  MODEL BUILDING: KNN    ######
#####################################

## <! introduce the KNN method !>

# to execute the KNN classification in R, we need to provide the knn() f. with
# 1) training data with no labels, 2) validation/test data with no labels, and 
# 3) labels for the training set. 
# So, before calling the knn() f., let's first extract labels
labels <- norm.dtm[, "TargetLabel"]
head(labels)

# the knn() f. comes from the 'class' package; install it only if it is missing
# (so that re-running the script does not re-download it), then load it
if (!requireNamespace("class", quietly = TRUE)) {
  install.packages("class")
}
library(class)

# number of neighbours to consider
k <- 9
# number of columns in the (training and validation) datasets 
m <- ncol(training.set)
# the class labels are taken from the very subsets the features come from,
# so they stay aligned with the split regardless of its size
knn.pred <- knn(train = training.set[, c(1:(m-1))], # the last column is the TargetLabel
                test = validate.set[, c(1:(m-1))],  # so, we're excluding it
                cl = training.set[["TargetLabel"]],
                k = k)

# create the confusion matrix
conf.mat <- table(Predictions = knn.pred, Actual = validate.set[["TargetLabel"]])
conf.mat

# a utility function for computing evaluation measures for the classifier
# A utility function for computing evaluation measures for a classifier.
#   cm:       confusion matrix with predictions in rows and actual classes in columns
#   n:        total number of classified observations
#   positive: name of the class we are interested in predicting ("spam" by default)
# Returns a named numeric vector: accuracy, precision, recall, F1.
compute.eval.measures <- function(cm, n, positive = "spam") {
  # accuracy must come from the matrix we were given, not from a global variable
  acc <- sum(diag(cm))/n
  # since we're interested in predicting spam, we'll compute precision and recall
  # using values of the confusion matrix that are related to the spam outcome;
  # the spam row/column is located by name, since its position depends on how the
  # class labels were ordered when the table was built; if the matrix carries no
  # such name, the first row/column is used
  pos.row <- match(positive, rownames(cm), nomatch = 1L)
  pos.col <- match(positive, colnames(cm), nomatch = 1L)
  true.pos <- unname(cm[pos.row, pos.col])
  prec <- true.pos / sum(cm[pos.row, ])  # out of everything predicted as spam
  rec <- true.pos / sum(cm[, pos.col])   # out of everything that really is spam
  f1 <- (2*prec*rec)/(prec+rec)
  # return the performance metrics
  c(accuracy = acc, precision = prec, recall = rec, F1=f1)  
}

compute.eval.measures(cm = conf.mat, n = nrow(validate.set) )
# the overall performance is not bad, the recall is particularly good
# however, the confusion matrix shows that there are too many false positives (FP), i.e., 
# emails that are nonspam but were classified as spam; the situation with false negatives (FN)
# (spam emails that were recognized as 'ham') is far better; however, for this type of 
# problem, it would be better if we can reduce FP, even at the expense of (slightly) 
# increasing FN; in other words, we should 'trade' some recall for better precision

# we made a guess about the number of neighbours; instead of guessing, we'll now run kNN with
# different values for k, and see which one gives the best performance; then, we'll use the 
# test set to test the model that proves to be the best

# it is recommended to choose an odd number for K, to avoid the situation of dealing with ties
# (half neighbours being of one class, the other half of the other class)
k.candidates <- c(3,5,7,9,11)
# one slot per candidate k, filled in the loop below
# (instead of growing a data frame with rbind() on every iteration)
eval.rows <- vector("list", length(k.candidates))
for(i in seq_along(k.candidates)) {
  # the class labels are taken from the very subsets the features come from
  knn.pred <- knn(train = training.set[, c(1:(m-1))], # the last column is the TargetLabel
                  test = validate.set[, c(1:(m-1))],  # so, we're excluding it
                  cl = training.set[["TargetLabel"]],
                  k = k.candidates[i])
  
  conf.mat <- as.matrix( table(Predictions = knn.pred, Actual = validate.set[["TargetLabel"]]) )
  eval.rows[[i]] <- compute.eval.measures(cm = conf.mat, n = nrow(validate.set) )
}
# stack the per-k measures into one data frame: one row per candidate k
results <- as.data.frame( do.call(rbind, eval.rows) )
colnames(results) <- c("accuracy", "precision", "recall", "F1")
results$k <- k.candidates
results
# the results show that k=7 leads to the best accuracy and F1 values, also
# precision and recall are very close to the best values; so, we'll choose k=7
# let's now test the model using the test set

####################################
######  MODEL TESTING: KNN    ######
####################################

# we're now ready to test the kNN model (using k=7, the value chosen above)
# we'll use both training and validation subsets to train the model
# the first train.size rows of norm.dtm are the original training emails,
# all the remaining rows are the test emails
train.rows <- seq_len(train.size)
knn.pred <- knn(train = norm.dtm[train.rows, c(1:(m-1))], # the last column is the TargetLabel
                test = norm.dtm[-train.rows, c(1:(m-1))],  # so, we're excluding it
                cl = labels[train.rows],
                k = 7)

conf.mat <- as.matrix( table(knn.pred, labels[-train.rows]) )
conf.mat
# n is the number of test emails
compute.eval.measures(cm = conf.mat, n = nrow(norm.dtm) - train.size )
# compared to the validation set, accuracy and precision are weaker, though recall has improved
# this is more or less expected (performance on the test set is typically weaker than on training
# and validation sets)


##############################################
######  MODEL BUILDING AND TESTING: NB  ######
##############################################

# naiveBayes() comes from the 'e1071' package; install it only if it is missing
# (so that re-running the script does not re-download it), then load it
if (!requireNamespace("e1071", quietly = TRUE)) {
  install.packages("e1071")
}
library(e1071)

# the first train.size rows of norm.dtm are the original training emails,
# all the remaining rows are the test emails
train.rows <- seq_len(train.size)

# build the model on the whole original training set
nb.model <- naiveBayes(TargetLabel ~ ., 
                       data = norm.dtm[train.rows,], laplace = 1)

# predict the class of each test email
nb.pred <- predict(object = nb.model, 
                   newdata = norm.dtm[-train.rows, ], type = "class")

cm <- as.matrix ( table( predicted = nb.pred, true = labels[-train.rows] ) )
cm

# n is the number of test emails
compute.eval.measures(cm = cm, n = nrow(norm.dtm) - train.size )

# accuracy of NB and kNN is almost exactly the same; 
# NB outperforms kNN in terms of precision, while kNN is better in recall
# since in the context of the given task (spam detection) precision is more important,
# the NB model should be chosen
# NOTE(review): the accuracy comparison only holds if compute.eval.measures() derives
# accuracy from its cm argument; a version that reads the global conf.mat would report
# the kNN accuracy here - re-check this conclusion after re-running


#######################################################
## COMPARE WORD CLOUDS FOR SPAM AND NONSPAM MESSAGES ##
#######################################################

# install the 'wordcloud' package only if it is missing
# (so that re-running the script does not re-download it), then load it
if (!requireNamespace("wordcloud", quietly = TRUE)) {
  install.packages("wordcloud")
}
library(wordcloud)

# separate spam from nonspam emails
# (columns 1..(m-1) are the word features; the last column is the TargetLabel)
spam.dtm <- as.matrix( norm.dtm[ norm.dtm$TargetLabel == "spam", 1:(m-1)] )
nonspam.dtm <- as.matrix( norm.dtm[ norm.dtm$TargetLabel == "ham", 1:(m-1)] )

# function for creating a word cloud
# Draws a gray-scale word cloud for the given matrix.
#   termDocRegMatrix: numeric matrix with one row per word; the row names are
#                     used as the words (the callers pass a transposed
#                     document-term matrix)
createWordCloud <- function( termDocRegMatrix ) {
  
  # overall weight of each word = sum of its row, largest first
  term.totals <- sort(rowSums( termDocRegMatrix ), decreasing = TRUE)
  # keep only the words whose overall weight is above the average weight
  term.totals <- subset(term.totals, term.totals > mean(term.totals))
  
  # fix the seed before drawing, so that the same input always gives the same cloud
  set.seed(1234)
  # one gray level per word, derived from the word's weight relative to the largest one
  shades <- gray( (term.totals + 10) / (max(term.totals) + 10) )
  wordcloud(words = names(term.totals), freq = term.totals, 
            random.order = FALSE, colors = shades)
}

# show word clouds for spam and nonspam emails one next to the other
# par() returns the previous settings, so we keep them to restore the layout afterwards
old.par <- par(mfrow = c(1,2))
createWordCloud(termDocRegMatrix = t(spam.dtm))
createWordCloud(termDocRegMatrix = t(nonspam.dtm))
# put back whatever plot layout was active before (instead of assuming it was 1x1)
par(old.par)
