# This R script is largely based on the R code prepared for the Data Science
# Dojo YouTube series "Introduction to Text Analytics with R", available at:
# https://www.youtube.com/watch?v=-wCrClheObk     
# The (original) R code from this series is available at the following GitHub repo:
# https://github.com/datasciencedojo/IntroToTextAnalyticsWithR

# The script makes use of the following R packages:
# - ggplot2 - for visualization
# - caret - for various ML tasks
# - randomForest - for building classifiers based on the Random Forest algorithm
# - quanteda - for various text analytics tasks
# - stringr - for advanced string processing
# - irlba - for singular vector decomposition (SVD)
# - lsa - for computing cosine similarity
# - doSNOW - for multi-core parallel process execution 
# If you miss any of these, install them, before proceeding with the script
# install.packages("<package_name>")

# Set the seed to be used in various computations that depend on random processes
seed <- 19118

# TASK: classification of SMS text messages into spam and those that
# are not spam (ham).
# The input: a csv file with labelled SMS text messages.

# Load up the csv file with the data and examine it
spam.raw <- read.csv("data/spam.csv", stringsAsFactors = FALSE)
str(spam.raw)

# Clean up the data frame and assign names to variables 
spam.raw <- spam.raw[, 1:2]
names(spam.raw) <- c("Label", "Text")
str(spam.raw)
View(spam.raw)

# Convert the class label into a factor variable
spam.raw$Label <- as.factor(spam.raw$Label)

###########################
# INITIAL DATA EXPLORATION
###########################

# The first step, as always, is to explore the data.

# First, check the data to see if there are missing values
length(which(!complete.cases(spam.raw)))

# Next, examine the distibution of the class labels (that is, ham vs. spam messages)
table(spam.raw$Label)
prop.table(table(spam.raw$Label))

# As there is (i) a large disproportion in the number of ham vs spam messages,
# and as (ii) the dataset is large and would require a lot of time for processing
# (building ML models), we will reduce the number of ham messages. 
# This will be done through random sampling.
ham.indices <- which(spam.raw$Label == "ham")
# sample (randomly) 2000 ham messages to keep
set.seed(seed)
ham.indices.sample <- sort(sample(x = ham.indices, size = 2000, replace = FALSE))
# the difference between the set of all ham indices and the set of those to be kept,
# gives us the indices of ham messages to be removed
ham.indices.to.remove <- setdiff(ham.indices, ham.indices.sample)
spam.reduced <- spam.raw[-ham.indices.to.remove,]
# check the distribution after the removal of ham messages
table(spam.reduced$Label)
prop.table(table(spam.reduced$Label))

# Next up, let's get a feel for the distribution of text lengths of the SMS 
# messages in the data set by adding a new feature for the length of each message
spam.reduced$TextLength <- nchar(spam.reduced$Text)
summary(spam.reduced$TextLength)

# Visualize distribution with ggplot2, adding segmentation for ham/spam
library(ggplot2)

ggplot(spam.reduced, aes(x = TextLength, fill = Label)) +
  theme_bw() +
  geom_histogram(binwidth = 5) +
  labs(y = "Message Count", x = "Length of messages",
       title = "Distribution of message lengths across class labels") +
  scale_x_continuous(breaks = seq(0,1000, 100))

ggplot(spam.reduced, aes(x = Label, y = TextLength, fill = Label)) +
  theme_bw() +
  geom_boxplot(show.legend = FALSE) +
  labs(y = "Distribution of message length", x = "Spam vs Ham messages",
       title = "Distribution of message lengths across class labels")

# In general, spam messages tend to be longer than ham messages
# The charts (ie. distribution) suggest that TextLength might serve as 
# a relevant feature when building a classification model.

###############################
# CREATE TRAINING AND TEST SETS
###############################

# Split the data into a training set and a test set. 
# In a real-world ML task, we would want to use a three-way split into
# training, validation, and test sets.
#
# As our data has non-trivial class imbalance, we'll use the caret package to create 
# a random stratified split, that is, a random train/test split that ensures the 
# same ham/spam class label proportions in both the training and test portions.
library(caret)

# Create a 75%/25% stratified split. 
# Set the random seed for reproducibility.
set.seed(seed)
train.indices <- createDataPartition(spam.reduced$Label, p = 0.75, list = FALSE)
train <- spam.reduced[train.indices,]
test <- spam.reduced[-train.indices,]

# Verify proportions
prop.table(table(train$Label))
prop.table(table(test$Label))

################################################
# EXPLORATION AND PRE-PROCESSING OF TEXTUAL DATA
################################################

# Text analytics requires a lot of data exploration, data pre-processing
# and data wrangling. Let's explore some examples.

# HTML-escaped ampersand character:
train$Text[75]
# replace escaped ampersen occurrences with "and"
replaceEscAmp <- function(x) gsub("&amp;", "and", x, fixed = TRUE) 
train$Text <- replaceEscAmp(train$Text)

# HTML-escaped '<' and '>' characters
train$Text[161]
# removes such characters
removeLtGtChars <- function(x) gsub("&lt;|&gt;", "", x) 
train$Text <- removeLtGtChars(train$Text)

# A URL
train$Text[11]
# replace urls with URL term
replaceURL <- function(x) gsub("(f|ht)(tp)(s?)(://)(.*)[.|/](.*)", "URL", x) 
train$Text <- replaceURL(train$Text)


# There are many packages in the R ecosystem for performing text analytics.
# One of the latest is quanteda. It has many useful functions for
# quickly and easily working with text data; they are well explained in the
# quanteda docs:
# http://docs.quanteda.io/index.html
library(quanteda)
# stringr is another useful library, often used in conjunction with quanteda
library(stringr)

#
# Tokenizing text messages
#
# When tokenizing documents, a typical practice is to remove numbers, punctuation, and 
# symbols. However, by inspecting our dataset, we can notice that the presence of numbers 
# tends to be a good indicator of spam messages. So, we will initially remove just
# punctuation marks and symbols, and later remove numbers (after creating a feature based
# on the presence of numbers in message text).
#
?tokens
train.tokens <- tokens(train$Text, what = "word", 
                       remove_punct = TRUE, remove_symbols = TRUE)

# Take a look at a specific SMS message and see how it has been transformed
train$Text[6]
train.tokens[[6]]

###########################################################
## INTERSPERS TEXT PRE-PROCESSING WITH FEATURE ENGINEERING
###########################################################

#
# Create a feature that is the count of numbers in a message
#
sum(str_detect(train.tokens[[6]], "^[0-9]+$"))
# the f. receives a vector of tokens (words) and returns the count of tokens that are numbers 
numbers_count <- function(msg_tokens) {
  sum(str_detect(msg_tokens, "^[0-9]+$"))
}

# count of tokens that are numbers
train$NumCount <- sapply(X = train.tokens,
                         FUN = function(x) ifelse(length(x) == 0, # check for empty string tokens
                                                  yes = 0,
                                                  no = numbers_count(x)))
summary(train$NumCount)
# Examine the potential usefulness of the built feature
ggplot(data = train, mapping = aes(x = Label, y = NumCount, fill = Label)) +
  geom_boxplot() +
  theme_bw() + ylab("Count of numbers per message")
# NumCount looks as a promising feature

#
# Remove numbers and one-char tokens
#
# Now that we have made use of numbers to create a feature, we can remove them
# from the token set (bag of words)
train.tokens <- tokens_remove(x = train.tokens, 
                              pattern = "^[0-9]+$", valuetype = "regex")
# check if the numbers were removed
train.tokens[[6]]

# We have noticed that there are tokens with one character only; these should be removed
# as they are not informative 
train.tokens <- tokens_keep(x = train.tokens, min_nchar = 2)
train.tokens[[6]]

#
# Add the presence of a URL as a feature
#
msg_has_URL <- function(msg_tokens) sum(str_detect(msg_tokens, "URL"))
train$HasURL <- sapply(train.tokens, 
                       function(x) ifelse(msg_has_URL(x) > 0, yes = "Yes", no = "No"))
train$HasURL <- as.factor(train$HasURL)
summary(train$HasURL)
# Not promissing as only 15 messages has URL
xtabs(~ Label + HasURL, data = train)
# Still, out of 15 messages with URL, 14 are spam 

#
# Add the proportion of tokens with all capital letters as a feature
#
# Compute the proportion of tokens with all capital letters, as
# it might serve as a good feature for spam detection
sum(str_detect(train.tokens[[6]], "^[A-Z]+$"))
all_upper_case_count <- function(msg_tokens) {
  sum(str_detect(msg_tokens, "^[A-Z]+$"))
}

# number of tokens (words) per message (after numbers and one letter tokens were removed)
train$TokenCount <- sapply(X = train.tokens, length)
summary(train$TokenCount)
# proportion of tokens with all capital letters
train$AllUpperCaseProp <- sapply(X = train.tokens,
                                 FUN = function(x) ifelse(length(x) == 0, # check for messages that were stripped of tokens
                                                          yes = 0,
                                                          no = all_upper_case_count(x)))
train$AllUpperCaseProp <- train$AllUpperCaseProp/train$TokenCount
summary(train$AllUpperCaseProp)

# Examine the potential usefulness of the built feature
ggplot(data = train, 
       mapping = aes(y = AllUpperCaseProp, x = Label, fill = Label)) +
  geom_boxplot(show.legend = FALSE) +
  theme_bw() + ylab("Proportion of all-caps words per message\n")
# Not fully clear, but some distinction (between the spam and ham messages) seems to be present 


###################################################
## REDUCE VARIATION AMONG THE WORDS (TOKEN SET):
## NORMALIZE THE TEXT, REMOVE STOPWORDS, STEM WORDS
###################################################

# Now that we've made use of the distinction of lower vs capital letters,
# we can reduce all tokens to lower letters to reduce the variability of the token set 
# (a part of the process known as text normalization)
train.tokens <- tokens_tolower(train.tokens)
train.tokens[[6]]

# Since the text of SMS messages tends to have a lot of misspelled words, it would 
# be useful to do spelling correction, as a part of the text normalization step.
# Typical approach is to check the text against some of the available 
# misspelling corpora (e.g. http://www.dcs.bbk.ac.uk/~ROGER/corpora.html).

# Use quanteda's built-in stopword list for English.
# NOTE: always inspect stopword lists for applicability to your problem/domain.
length(stopwords())
head(stopwords(), n = 30)
train.tokens <- tokens_remove(train.tokens, stopwords())
train.tokens[[6]]

# Perform stemming on the tokens.
train.tokens <- tokens_wordstem(train.tokens, language = "english")
train.tokens[[6]]

# Save the pre-processed training data and tokens
saveRDS(object = train, file = "data/preprocessed/spam_train_data.RData")
saveRDS(object = train.tokens, file = "data/preprocessed/spam_train_tokens.RData")

###################################
# CREATE DOCUMENT TERM MATRIX (DTM)
###################################

# In quanteda's terminology DTM is referred to as "document feature matrix" - dfm
train.tokens.dfm <- dfm(train.tokens, 
                        tolower = FALSE) # by default, tolower=TRUE; 
                                         # since, we've already lower cased the tokens, 
                                         # no need to do it again 
train.tokens.dfm

# Transform to a (regular) matrix and inspect
train.tokens.matrix <- as.matrix(train.tokens.dfm)
View(train.tokens.matrix[1:20, 1:100])
# it's very sparse
sparsity(train.tokens.dfm)

# Setup a feature data frame with labels
train.tokens.df <- cbind(Label = train$Label, data.frame(train.tokens.dfm))

########################################################
# BUILD the 1st ML MODEL: RPART + UNIGRAMS + TF WEIGTHS
########################################################

# Per best practices, we will leverage cross validation (CV) for our
# modeling process. In particular, we will perform 10-fold cross validation to
# tune parameters and find best performing models.
# Note that our data set is not trivial in size. As such, CV runs will take 
# quite a long time. To cut down on total execution time, we use
# the doSNOW package to allow for training in parallel on multiple cores.

source("RequiredFunctions.R") 

# Due to the size of the DTM, at this point, we use a single decision
# tree (DT) algorithm to build our first model. We will use more powerful algorithms 
# later when we perform feature reduction to shrink the size of our data set.

library(rpart)
# we will tune the cp parameter, which is considered the most important in the
# rpart function (the one used for building a DT).
# cp stands for the complexity parameter; any split that does not improve the overall
# fit of the model by at least cp is not attempted; default value is 0.01
cpGrid = expand.grid( .cp = seq(from = 0.001, to = 0.005, by = 0.0005)) 
rpart.cv.1 <- cross_validate_classifier(seed,
                                        nclust = 5,
                                        train.data = train.tokens.df,
                                        ml.method = "rpart",
                                        grid.spec = cpGrid)

# Check out our results:
rpart.cv.1
plot(rpart.cv.1)
# best values:
# cp     Accuracy   Kappa    
# 0.001  0.9087824  0.7570515
tf.best.cp <- rpart.cv.1$bestTune$cp
tf.best.results <- with(rpart.cv.1, results[results$cp == tf.best.cp,])

# By inspecting the tree, we can get an idea of the terms (features)
# the DT model selected as the most important
print(rpart.cv.1$finalModel)

################################
## APPLY TF-IDF WEIGHTING SCHEME
################################

# The DTM used in the first model is based on the counts of terms, often known as 
# term frequency (TF). However, this is not always the best metric for estimating 
# the relevance of a word in a corpus.
# The Term Frequency-Inverse Document Frequency (TF-IDF) metric tends to provide
# better results. Specifically, TF-IDF accomplishes the following goals:
# 1 - The TF metric does not account for the fact that longer documents will have 
#     higher individual term counts. By normalizing TF values, using, for example, 
#     L1 norm, that is, the document length expressed as the number of words,
#     we get a metric that is length independent.
# 2 - The IDF metric accounts for the frequency of term appearance in all documents 
#     in the corpus. The intuition being that a term that appears in almost every  
#     document has practically no predictive power.
# 3 - The multiplication of normalized TF by IDF allows for weighting each term 
#     based on both its specificity at the level of the overall corpus (IDF) and its 
#     specificity for a particular document (ie. relatively high presence in the document).


# First step, normalize all documents via TF
train.tokens.tf <- apply(train.tokens.matrix, 1, relative.term.frequency)
dim(train.tokens.tf)
# note that the matrix has been transposed
View(train.tokens.tf[1:20, 1:100])

# Second step, calculate the IDF vector 
train.tokens.idf <- apply(train.tokens.matrix, 2, inverse.doc.freq)
str(train.tokens.idf)
# note that the result is a vector with IDF value for each term

# Lastly, calculate TF-IDF
train.tokens.tfidf <-  apply(train.tokens.tf, 2, # tf.idf f. is applied to columns, as in the given matrix, each column corresponds to one document
                             tf.idf, idf = train.tokens.idf)
dim(train.tokens.tfidf)
View(train.tokens.tfidf[1:25, 1:25])

# Transpose the TF-IDF matrix
train.tokens.tfidf <- t(train.tokens.tfidf)
View(train.tokens.tfidf[1:25, 1:25])


# Check for incomplete cases. 
# These would be the cases where normalization of TF values produced NaNs. 
# That would be the case with messages that were stripped off all their tokens 
# during the pre-processing (messages that consisted only of stop-words, 
# numbers and/or symbols).
which(!complete.cases(train.tokens.tfidf))
# No such cases

# Make a clean data frame using the same process as before
train.tokens.tfidf.df <- cbind(Label = train$Label, data.frame(train.tokens.tfidf))

############################################################
# BUILD the 2nd ML MODEL: RPART + UNIGRAMS + TF-IDF WEIGHTS
############################################################

# Build a classifier through CV, using the new version of the DTM
rpart.cv.2 <- cross_validate_classifier(seed,
                                        nclust = 5,
                                        train.data = train.tokens.tfidf.df,
                                        ml.method = "rpart",
                                        grid.spec = cpGrid)

# Check out our results:
rpart.cv.2
plot(rpart.cv.2)
# note that even though values 0.02 and 0.015 produced the same accuracy and kappa
# values, 0.02 was selected as it leads to a (bit) less complex tree
tfidf.best.cp <- rpart.cv.2$bestTune$cp
tfidf.best.results <- with(rpart.cv.2, results[results$cp == tfidf.best.cp,])

# examine the terms (features) used for building the model
print(rpart.cv.2$finalModel)

# compare the results
data.frame(rbind(tf.best.results, tfidf.best.results), 
           row.names = c("TF", "Norm_TF-IDF"))

# The results are better, especially, with respect to the Kappa metric.
# Kappa is more relevant than Accuracy when the dataset is imbalanced
# (one class far more present than the other(s)), as is the case with our dataset.


#####################################
## ADDING BIGRAMS TO THE FEATURE SET
#####################################

# N-grams allow us to augment our DTM matrix with word ordering.
# This often leads to increased performance (e.g., accuracy)
# over ML models trained with unigrams only. 
# Let's add bigrams to our training data and apply TF-IDF metric to
# the expanded DTM to see if accuracy would improve.

# Add bigrams to the feature matrix
train.tokens.2 <- tokens_ngrams(train.tokens, n = 1:2)
train.tokens.2[[6]]

# Transform to dfm and then a matrix
train.tokens.2.dfm <- dfm(train.tokens.2, tolower = FALSE)
train.tokens.2.dfm
# note the number of features and the level of sparsity
train.tokens.2.matrix <- as.matrix(train.tokens.2.dfm)

# Compute relative (normalized) TF
train.tokens.2.tf <- apply(train.tokens.2.matrix, 1, relative.term.frequency)

# Calculate the IDF vector (to be used for training and test data)
train.tokens.2.idf <- apply(train.tokens.2.matrix, 2, inverse.doc.freq)

# Calculate TF-IDF for our training corpus 
train.tokens.2.tfidf <-  apply(train.tokens.2.tf, 2, 
                               tf.idf, idf = train.tokens.2.idf)

# Transpose the matrix
train.tokens.2.tfidf <- t(train.tokens.2.tfidf)

# Check for incomplete cases
which(!complete.cases(train.tokens.2.tfidf))

# Make a clean data frame
train.tokens.2.tfidf.df <- cbind(Label = train$Label, data.frame(train.tokens.2.tfidf))

# Build a CV-ed model with the new feature set
# rpart.cv.3 <- cross_validate_classifier(seed, nclust = 5,
#                                         train.data = train.tokens.2.tfidf.df,
#                                         ml.method = "rpart",
#                                         grid.spec = cpGrid)

# Could not be built on my computer due to the huge number of features (columns)
# 17,470. In particular, the problem is the result of the formula expansion 
# (for rpart) exceeding memory/recursion stack size limits. It can be ignored as
# in practice, we would either: i) use SVD to reduce the number of dimensions
# (as we do next); or ii) use an algorithm that doesn't require the use of the 
# formula interface (e.g., random forest, SVM, etc.)


####################################
# SINGULAR VALUE DECOMPOSITION (SVD)
# FOR REDUCING THE FEATURE SPACE
####################################

# Use Singular Value Decomposition (SVD) to reduce the number of features (terms) 
# to a smaller set of those that could be expected to be the most useful 
# (ie. explain a large portion of variability in the data)

# Suggested reading for SVD and Latent Semantic Analysis, which is, in fact,
# SVD applied to text analytics:
# - Landauer, T. K., Foltz, P. W., & Laham, D. (1998). Introduction to Latent 
#   Semantic Analysis. Discourse Processes, 25, 259-284. 
#   URL: http://lsa.colorado.edu/papers/dp1.LSAintro.pdf


# We'll use the irlba R package for SVD. It allows for specifying
# the number of the most important singular vectors we wish to calculate
# and retain for features (in SVD terms, the rank we want to reduce the
# original matrix to).
library(irlba)

# Time the code execution
start.time <- Sys.time()

# Perform SVD. Specifically, reduce the dimensionality down to 300 columns.
# 300 is chosen as the number that is often recommended (based on the experience in practice);
# to get the best results, the number of dimensions has to be experimentally determined
# (try several different values and compare the performance of the resulting models)
train.irlba <- irlba(t(train.tokens.2.tfidf), # it is transposed as SVD / LSA require TDM as an input 
                     nv = 300,  # number of singular vectors to estimate
                     maxit = 600) # maxit is set to be twice larger than nv 

# Total time of execution on workstation was 
total.time <- Sys.time() - start.time
total.time

## How to determine the "right" number of singular vectors is still an open issue.
# Some useful links on that topic (in the descending order of perceived usefulness):
# - https://stackoverflow.com/questions/9582291/how-do-we-decide-the-number-of-dimensions-for-latent-semantic-analysis 
# - https://irthoughts.wordpress.com/2008/02/13/lsi-how-many-dimensions-to-keep/
#

# Examine the result:
str(train.irlba)
# d - corresponds to singular values (values on the diagonal of the sigma matrix)
# u - corresponds to the left singular vector and respresents 
#     higher level concepts extracted from terms
# v - corresponds to the right singular vector and respresents 
#     higher level concepts extracted from documents

# Take a look at the new feature set (the right singular vector, v) up close
dim(train.irlba$v)
# instead of n-grams, each document is represented using the features (singular values)
# extracted by applying the SVD procedure over the original feature set
View(train.irlba$v[1:20,1:50])

# Examine the predictive power of the model with singular vectors as features

#############################################################################
# BUILD the 4th ML MODEL: RPART + SINGULAR VECTORS (FROM TF-IDF WEIGHTED TDM)
#############################################################################

# Create new feature data frame using our document semantic space of 300
# features (the V matrix produced by SVD)
train.svd <- data.frame(Label = train$Label, train.irlba$v)

# build a DT-model with an expanded grid search space - as we now have
# significantly smaller number of features, CV will be far more efficient 
cpGrid.2 = expand.grid( .cp = seq(from = 0.001, to = 0.01, by = 0.0005)) 
rpart.cv.4 <- cross_validate_classifier(seed, 
                                        nclust = 5,
                                        train.data = train.svd,
                                        ml.method = "rpart",
                                        grid.spec = cpGrid.2)

# Check out the results
rpart.cv.4
plot(rpart.cv.4)
svd.best.cp <- rpart.cv.4$bestTune$cp
svd.best.results <- with(rpart.cv.4, results[results$cp==svd.best.cp,])

# compare the results
comparison <- data.frame(rbind(tf.best.results, tfidf.best.results, svd.best.results),
                         row.names = c("TF", "Nom_TF-IDF", "SVD"))
comparison$NFeatures <- c(ncol(train.tokens.df),
                          ncol(train.tokens.tfidf),
                          ncol(train.svd))
comparison
# We got somewhat weaker performance both in terms of Accuracy and Kappa; 
# however, note that we have almost 15 times less features, which makes our model
# more robust, that is, less prone to overfitting 


####################################################
## USING THE (SVD) REDUCED FEATURE SET WITH NEW DATA
## (when the built predictive model is to be applied 
## on new SMS messages to detect spam)
####################################################

# As with TF-IDF, we will need to project any new data (e.g., the test data)
# into the SVD space. The formula to be used for that projection: 
# document_hat = sigma_inverse * transposed_U_matrix %*% document_TF-IDF_vector
#
sigma.inverse <- 1 / train.irlba$d
u.transpose <- t(train.irlba$u)
# as an example, let's use the first document from the training set 
# (that is, the TF-IDF representation of the first document)
example.doc <- as.vector(train.tokens.2.tfidf[1,])
# the projection of the document in the SVD space:
example.doc.hat <- sigma.inverse * u.transpose %*% example.doc
# Look at the first 10 components of projected document...
example.doc.hat[1:10]
# ... and the corresponding row in the document space produced by SVD (the V matrix)
train.irlba$v[1, 1:10]
# the two are highly similar (note the values are expressed in e-16, e-18,...); 
# in fact, the differences are so tiny that when we compute 
# cosine similarity between the two vectors, the similarity turns to be equal to 1.
library(lsa)
cosine(as.vector(example.doc.hat), as.vector(train.irlba$v[1,]))
#
# Why is this useful?
# It shows that using the above given formula, we can transform any document into
# singular vector space using the computed sigma_inverse and transposed_U_matrix; 
# this further means that we can take a new, unseen document,
# compute TF-IDF values for it and transform it into singular vector space
# so that it can be run through, that is, classified in our prediction model.


## TASK: EXAMINE AN ALTERNATIVE APPROACH OF DATA PREPARATION FOR SVD / LSA
## 
## According to (Landauer, Foltz, & Laham, 1998):
##    "Before the SVD is computed, it is customary in LSA to subject the data 
##    in the raw word-by-context matrix to a two-part transformation. 
##    First, the word frequency (+ 1) in each cell is converted to its log. 
##    Second, the information-theoretic measure, entropy, of each word is computed as:
##    -p*logp over all entries in its row, and each cell entry then divided by 
##    the row entropy value. 
##    The effect of this transformation is to weight each word-type occurrence directly 
##    by an estimate of its importance in the passage and inversely with the degree to 
##    which knowing that a word occurs provides information about which passage [document]
##    it appeared in."
##
## So, instead of TF-IDF, transform the original DTM (train.tokens.2.dfm) in the 
## manner suggested above, apply SVD on the thus transformed DTM, and build 
## an rpart model, as done above. Compare the results with those of rpart.cv.4. 


###############################################
# BUILD the 5th ML MODEL: RANDOM FOREST + 
# SINGULAR VECTORS (FROM TF-IDF WEIGHTED TDM)
###############################################

# We have reduced the dimensionality of our data using SVD. Now, we can use a more 
# complex and powerful classification algorithm. In particular, we will build a 
# Random Forest (RF) model.

# For a brief introduction to the Random Forest algorithm, see the lecture slides.
# For more details and an excellent explanation of Random Forest and related algorithms,
# see chapter 8.2 of the Introduction to Statistical Learning book.

# We will build RF with the default of 500 trees. We'll also ask try different 
# values of the mtry parameter (the number of variables randomly sampled as 
# candidates at each split) to find the mtry value that gives the best result.

# NOTE - The following code takes a long time to run. Here's the math.
#        We are performing 5-fold CV. That means we will examine each 
#        model configuration 5 times. We are also asking caret to try 
#        several (7) different values of the mtry parameter. 
#        Next, by default RF builds 500 trees. Lastly, caret will
#        build 1 final model at the end of the process with the best 
#        mtry value over all the training data. So, the number of 
#        tree we're building is:
#
#             (5 * 7 * 500) + 500 = 18,000 trees!
#
mtryGrid = expand.grid( .mtry = seq(from = 1, to = ncol(train.svd), length.out = 7))
rf.cv.1 <- cross_validate_classifier(seed, nclust = 5,
                                     train.data = train.svd,
                                     ml.method = "rf",
                                     grid.spec = mtryGrid)

# Check out the results
rf.cv.1
plot(rf.cv.1)
# save the model
saveRDS(rf.cv.1, "models/rf.cv.1.RData")

# Examine the results in more detail
rf.cv.1.eval <- confusionMatrix(reference = train.svd$Label, 
                                data = rf.cv.1$finalModel$predicted)
rf.cv.1.eval

# Compare the results with the previously CV-ed models
comparison <- data.frame(rbind(comparison[,2:6], 
                               c(rf.cv.1.eval$overall[1:4], ncol(train.svd))),
                         row.names = c(row.names(comparison), "RF_SVD"))
comparison
# Obviously, the use of a more powerful algorithm significantly improved the results, 
# in terms of accuracy and kappa; on the other hand, SD has incrased, as well. 



###############################################
# BUILD the 6th ML MODEL: RANDOM FOREST + 
# SINGULAR VECTORS (FROM TF-IDF WEIGHTED TDM) +
# "HAND-CRAFTED" DENSE FEATURES 
###############################################

# Now let's add the features we engineered previously
# to check if they will improve the model
str(train)

# Add TextLength, NumCount, AllUpperCaseProp, and HasURL
# to the feature set:
train.svd <- cbind(train.svd, 
                   TextLength=train$TextLength, 
                   NumCount=train$NumCount, 
                   AllUpperCaseProp=train$AllUpperCaseProp,
                   HasURL=train$HasURL)
tail(colnames(train.svd), n=10)

# build a model with the new features added
rf.cv.2 <- cross_validate_classifier(seed, 
                                     nclust = 5,
                                     train.data = train.svd,
                                     ml.method = "rf",
                                     grid.spec = mtryGrid)

# Again, as the model building will take too much time,
# we'll save the model to have it available for later
saveRDS(rf.cv.2, "models/rf.cv.2.RData")

# Examine the results
rf.cv.2
plot(rf.cv.2)

# Drill-down on the results
rf.cv.2.eval <- confusionMatrix(reference = train.svd$Label, 
                                data = rf.cv.2$finalModel$predicted)
rf.cv.2.eval

# Compare the results with the previously CV-ed models
comparison <- data.frame(rbind(comparison, 
                               c(rf.cv.2.eval$overall[1:4], ncol(train.svd))),
                         row.names = c(row.names(comparison), "RF_SVD_DenseFeat"))
comparison
# There is a notable improvement in both the Accuracy and Kappa metrics

# The added features are obviously important, but how important are they?

library(randomForest)
# The following plots show the mean decrease in the Gini index if a particular 
# feature is not included in the model. The larger the decrease, the more 
# relevant the feature is.
# Gini index measures the total decrease in node impurity from splitting 
# on the feature, averaged over all the trees
varImpPlot(rf.cv.1$finalModel, type = 2)
varImpPlot(rf.cv.2$finalModel, type = 2)

# Turns out that NumCount, AllUpperCaseProp, and TextLength features are very predictive; 
# HasURL is also among the top 15 (out of 305) features. 

#######################################################
## CREATE AND USE AN ADDITIONAL DENSE FEATURE
## (based on the COSINE SIMILARITY with spam messages)
#######################################################

# We can use cosine similarity to engineer a feature for estimating how similar
# each SMS text message is to all the spam SMS messages.
# The hypothesis here is that our use of bigrams, tf-idf, and SVD has 
# produced a representation where ham SMS messages should have low similarity
# with spam SMS messages and vice versa.

# Use the lsa package's cosine function for our calculations
library(lsa)

?cosine
# The cosine f. calculates cosine similarity between all columns of a matrix.
# Since we need to compute similarity between SMS messages that are given in rows
# of our DTM, we need to transpose the matrix. In addition, as we want to compute 
# similarites based on the singular vector representation of messages, before 
# transposing the matrix, we should remove the columns that correspond to our 
# "custom-made, dense" features.
cols.to.remove <- c(1,302:305)
tdm.svd <- t(as.matrix(train.svd[,-cols.to.remove]))
dim(tdm.svd)
train.similarities <- cosine(tdm.svd)

# Next, take each SMS text message and find the mean cosine 
# similarity between that message and the spam SMS messages.
# Per our hypothesis, ham SMS text messages should have (relatively) low
# cosine similarities with spam messages and vice versa

# Start by identifying spam messages
spam.indices <- which(train$Label == "spam")

# add a new feature based on the computed cosine similarity
train.svd$SpamSimilarity <- rep(0.0, nrow(train.svd))
for(i in 1:nrow(train.svd)) {
  train.svd$SpamSimilarity[i] <- mean(train.similarities[i, spam.indices])  
}
summary(train.svd$SpamSimilarity)

# As before, let's visualize the distribution of the new feature to estimate 
# its potential predictive power
ggplot(train.svd, aes(x = SpamSimilarity, fill = Label)) +
  theme_bw() +
  geom_histogram(binwidth = 0.05) +
  labs(y = "Message Count",
       x = "Mean Spam Message Cosine Similarity",
       title = "Distribution of Ham vs. Spam Using Spam Cosine Similarity")
# The plot suggests a useful feature

# Do another CV round using the new spam cosine similarity feature
rf.cv.3 <- cross_validate_classifier(seed, 
                                     nclust = 5,
                                     train.data = train.svd,
                                     ml.method = "rf",
                                     grid.spec = mtryGrid)

# Save the results
saveRDS(rf.cv.3, "models/rf.cv.3.RData")

# Examine the results
rf.cv.3
plot(rf.cv.3)

# Drill-down on the results
rf.cv.3.eval <- confusionMatrix(reference = train.svd$Label, 
                                data = rf.cv.3$finalModel$predicted)
rf.cv.3.eval
# Note that the number of FNs (predicting spam for ham messages) has slightly decreased
# (from 15 in rf.cv.2 to 13 in rf.cv.3). This is desirable in spam detection as 
# it means that we have reduced the number of non spam that will be classified as
# spam and thus not shown to the user.

# compare the results with the previously CV-ed models
comparison <- data.frame(rbind(comparison, 
                               c(rf.cv.3.eval$overall[1:4], ncol(train.svd))),
                         row.names = c(row.names(comparison), "RF_SVD_DenseFeat2"))
comparison
# We have a small improvement in both Accuracy and Kappa.
# Considering these results, as well as the decrase in FNs, the current model (rf.cv.3) 
# could be considered better then the previous 

# How important was the new feature?
varImpPlot(rf.cv.3$finalModel)
# It seems to be highly significant

##################
# TEST THE MODELS
##################

# We've built what appears to be an effective predictive model. Time to verify
# it using the test holdout data we set aside at the beginning of the project.
# The first stage of this verification is running the test data through our 
# pre-processing pipeline of:
# - Removal / replacement of 'strange' characters
# - Tokenization
# - Computing 'custom' (dense) features
# - Lower casing
# - Stopword removal
# - Stemming
# - Adding bigrams
# - Transforming tokens to DTM
# - Ensuring the test DTM has same features (tokens) as the train DTM

# 1) Removal / replacement of 'strange' characters

# replace escaped ampersen occurrences with "and"
test$Text <- replaceEscAmp(test$Text)

# remove HTML-escaped '<' and '>' characters
test$Text <- removeLtGtChars(test$Text)

# replace urls with URL term
test$Text <- sapply(test$Text, replaceURL)

# 2) Tokenization
test.tokens <- tokens(test$Text, what = "word", 
                      remove_punct = TRUE, remove_symbols = TRUE)

#
# 3) Computing 'custom' features
#
# 3.1) a feature that is the count of numbers in a message
test$NumCount <- sapply(X = test.tokens,
                         FUN = function(x) ifelse(length(x) == 0, yes = 0,
                                                  no = numbers_count(x)))

# Remove numbers from the token set
test.tokens <- tokens_remove(x = test.tokens, 
                             pattern = "^[0-9]+$", valuetype = "regex")

# Remove tokens with one character only
test.tokens <- tokens_keep(x = test.tokens, min_nchar = 2)


# 3.2) a feature that is the proportion of tokens with all capital letters
test$TokenCount <- sapply(X = test.tokens, length)
test$AllUpperCaseProp <- sapply(X = test.tokens,
                                 FUN = function(x) 
                                   ifelse(length(x) == 0, 
                                          yes = 0,
                                          no = all_upper_case_count(x)))
test$AllUpperCaseProp <- test$AllUpperCaseProp/test$TokenCount

# 3.3) the presence of a URL as a feature
test$HasURL <- as.factor(sapply(test.tokens, 
                                function(x) ifelse(msg_has_URL(x) > 0, 
                                                   yes = "Yes", no = "No")))


# 4) Lower case the tokens
test.tokens <- tokens_tolower(test.tokens)

# 5) Stopword removal
test.tokens <- tokens_remove(test.tokens, stopwords())

# 6) Stemming
test.tokens <- tokens_wordstem(test.tokens, language = "english")

# Store the pre-processed test data and tokens
# saveRDS(object = test, file = "data/preprocessed/spam_test_data.RData")
# saveRDS(object = test.tokens, file = "data/preprocessed/spam_test_tokens.RData")

# 7) Add bigrams
test.tokens <- tokens_ngrams(test.tokens, n = 1:2)

# 8) Convert tokens to DTM (in quanteda: dfm) matrix
test.tokens.dfm <- dfm(test.tokens, tolower = FALSE)

# Compare the train and test quanteda dfm objects
train.tokens.2.dfm
test.tokens.dfm

# 9) Ensure the test dfm has the same n-grams as the training dfm
#
# NOTE - In production we should expect that new text messages will 
#        contain n-grams that did not exist in the original training
#        data. As such, we need to represent any new message in the 
#        feature space of the training set, that is, the feature space
#        our classifier is 'familiar with' 
#
test.tokens.dfm <- dfm_select(test.tokens.dfm, 
                              pattern = train.tokens.2.dfm,
                              selection = "keep")
test.tokens.dfm
# now, test dfm has the same number of features as the train dfm
# let's check if those are really the same features
test.tokens.matrix <- as.matrix(test.tokens.dfm)
setdiff(colnames(test.tokens.matrix), colnames(train.tokens.2.matrix)) 
# no difference -> they are exactly the same

# With the raw test features in place, the next step is to 'project' 
# the test features into the same TF-IDF vector space we built for our 
# training data. This requires the following steps:
#      1 - Normalize term counts in each document (i.e, each row)
#      2 - Perform IDF multiplication using training IDF values
#         (we use the IDF values from the training set, as we represented
#         our test data using the features (ngrams) of the training set)

# Normalize term counts in all test messages
test.tokens.tf <- apply(test.tokens.matrix, 1, relative.term.frequency)
str(test.tokens.tf)

# Next, calculate TF-IDF using IDF for our training corpus
test.tokens.tfidf <-  apply(test.tokens.tf, 2, tf.idf, idf = train.tokens.2.idf)
dim(test.tokens.tfidf)

# Transpose the matrix (so that the documents are in the rows)
test.tokens.tfidf <- t(test.tokens.tfidf)
View(test.tokens.tfidf[1:25, 1:25])

# Check for incomplete cases
incomplete.cases <- which(!complete.cases(test.tokens.tfidf))
View(test.tokens.tfidf[incomplete.cases, 1:25])
# Replace NaNs with zeros 
test.tokens.tfidf[incomplete.cases,] <- rep(0.0, times = ncol(test.tokens.tfidf))
which(!complete.cases(test.tokens.tfidf))


# With the test data projected into the TF-IDF vector space of the training
# data, we can now do the final projection into the training LSA space
# (i.e. apply the SVD matrix factorization).
# Reminder: we said that the following formula can be used to represent a 
# document in the LSA space:
# document_hat = sigma_inverse * transposed_U_matrix %*% document_TF-IDF_vector
# If we have multiple documents, as is now the case, instead of document_TF-IDF_vector,
# we will have a matrix of TF-IDF values, with documents in columns and terms in rows
test.svd.raw <- sigma.inverse * u.transpose %*% t(test.tokens.tfidf)
dim(test.svd.raw)

# With the feature set ready, we can now build the test data frame to 
# feed into our trained machine learning model for predictions. 
# Before that, we should add Label and the four custom dense features
# (TextLength, NumCount, AllUpperCaseProp, and HasURL).
test.svd <- data.frame(Label = test$Label, 
                       t(test.svd.raw), # need to transpose it, to place documents in rows
                       TextLength = test$TextLength,
                       NumCount = test$NumCount,
                       AllUpperCaseProp = test$AllUpperCaseProp, 
                       HasURL = test$HasURL)

# Now we can make predictions on the test data set using our trained  
# random forest model (rf.cv.2)
preds <- predict(rf.cv.2, newdata = test.svd)

# Examine the results
rf.2.test.eval <- confusionMatrix(data = preds, reference = test.svd$Label)
rf.2.test.eval

# Let's remind ourselves on the CV results 
rf.cv.2.eval

# Compare test and CV results w.r.t. precision, recall, F1
rf.cv.2.eval$byClass[c("Precision", "Recall", "F1")]
rf.2.test.eval$byClass[c("Precision", "Recall", "F1")]
# almost the same CV results

# Check for FNs
rf.2.test.eval$table
# very low FNs, which is, of course, excellent!

#
# Examine also the performance of the RF model with the SpamSimilarity feature (rf.cv.3)
#

# Calculate SpamSimilarity feature for all the messages from the test set. 
#
# First, create a spam similarity matrix. To that end, add to the existing test feature 
# set (test.svd.raw) singular vector representation of spam messages from the training
# set (rows of the V matrix that correspond to the spam messages)
# 
test.svd.raw <- t(test.svd.raw)
test.similarities <- rbind(test.svd.raw, train.irlba$v[spam.indices,])
dim(test.similarities)
test.similarities <- cosine(t(test.similarities)) 

# Now, create SpamSimilarity feature by computing the mean value of similarity  
# between each message from the test set and each spam message from the training set
test.svd$SpamSimilarity <- rep(0.0, nrow(test.svd))
spam.cols <- (nrow(test.svd) + 1):nrow(test.similarities)
for(i in 1:nrow(test.svd)) {
  test.svd$SpamSimilarity[i] <- mean(test.similarities[i, spam.cols])  
}
summary(test.svd$SpamSimilarity)

# Note a few NA values for the SpamSimilarity feature. Here is why:
# some SMS messages became empty as a result of stopword and special 
# character removal, or because their words were not present in the
# training set (hence not present in the dfm). 
# We'll replace NAs with zeros.
test.svd$SpamSimilarity[is.na(test.svd$SpamSimilarity)] <- 0


# Now we can make predictions on the test data set using our final  
# random forest model (rf.cv.3).
preds <- predict(rf.cv.3, newdata = test.svd)

# Examine the results
rf.3.test.eval <- confusionMatrix(data = preds, reference = test.svd$Label)
rf.3.test.eval

# Compare the test results of the rf.cv.2 and rf.cv.3 models 
metrics <- c("Precision", "Recall", "F1")
test.compare <- data.frame(rbind(rf.2.test.eval$byClass[metrics], 
                                 rf.3.test.eval$byClass[metrics]),
                           row.names = c("RF_2", "RF_3"))
test.compare <- cbind(test.compare, 
                      rbind(rf.2.test.eval$overall[c("Accuracy", "Kappa")],
                            rf.3.test.eval$overall[c("Accuracy", "Kappa")]))

test.compare

# Our final model (RF.CV.3) proved to be the best on the test set, 
# according to all the metrics  

