getwd()

#import data
train.data <- read.csv("data/titanic/train.csv", stringsAsFactors = F)
test.data <- read.csv("data/titanic/test.csv", stringsAsFactors = F)

##################################
######  EXPLORING THE DATA  ######
##################################

# Let's start by examining structure of the training dataset
# Note: description of all the varibles is available at the Kaggle website:
# http://www.kaggle.com/c/titanic-gettingStarted/data
str(train.data)

# We can also take a look at the first couple of rows
head(train.data)

# Let's see the structure of the test dataset, as well
# the structure is almost exactly the same as of the training set, 
# except that the Survived column is missing
str(test.data)

# Let's now check if the data is complete, i.e., if there is some missing values.
# The easiest way to get a high level view on the data completeness is to visualize the data
# To that end, we'll use the Amelia package; first, we install the package  
# install.packages("Amelia")
# and the we load it 
library(Amelia)
# we now structure the display area to show two plots in the same row  
par(mfrow = c(1,2))
# printing graphs of missing data for the training and the test sets
missmap(obj = train.data, main = "Missing data in the training set", legend = F)
missmap(obj = test.data, main = "Missing data in the test set", legend = F)
# reverting plotting area to the default (one plot per row)
par(mfrow = c(1,1))

# there's a lot of missing data for the Age variable; let's see that more precisely
summary( train.data$Age )
summary( test.data$Age )

# there is also one missing Fare value in the test set; let's see which one
which( is.na(test.data$Fare) )
# it's the 153rd instance; let's see it 
test.data[153, ]

# we previously noticed that in some observations the value for the Cabin variable is missing;
# that was not visible in the previous plots, as the detection of missing values was 
# based on the NA values
# let's check how many missing values we have for the Cabin in both datasets
length( which(train.data$Cabin == "") )
length( which(test.data$Cabin == "") )

# some of these missing values are due to the fact that passangers from the
# 2nd and the 3rd class did not have a cabin (not 'real' missing values); 
# however, those from the 1st class must have had a cabin (these would be 'real' missing values); 
# let's check
nrow(train.data[train.data$Cabin == "" & train.data$Pclass == 1,])
# for 40 1st class passengers in the training set the cabin value is missing
nrow(test.data[test.data$Cabin == "" & test.data$Pclass == 1,])
# and for 27 1st class passangers from the test set

# so, let's replace the missing Cabin values for 1st class passangers with NAs 
train.data[ (train.data$Cabin == "") & (train.data$Pclass == 1), "Cabin"] <- NA
test.data[ (test.data$Cabin == "") & (test.data$Pclass == 1), "Cabin"] <- NA

# let's also check other string variables for missing values 'hidden' as empty strings
length(which(train.data$Name==""))
# no missing values
length(which(train.data$Ticket==""))
# no missing values
length(which(train.data$Embarked==""))
# 2 missing values
# Embarked is essentially a nominal variable with 3 possible values ('S', 'C', and 'Q')
# in situation like this, the missing values are replaced by the 'majority class',
# that is, the most dominant value
xtabs(~Embarked, data = train.data)
train.data[train.data$Embarked=="","Embarked"] <- 'S'
train.data$Embarked <- factor(train.data$Embarked)


# Let's replace the missing Fare value with the average value of the Fare variable
# for the passengers of the same Pclass 
# this is a typical technique used for imputing the missing values 
# (imputation is the process of replacing missing data with substituted values) 
# first, check the distribution of the Fare variable, to decide if we're using 
# mean or median as the average value
shapiro.test(test.data$Fare)
# not normaly distributed -> use median
# now, find the observation (passanger) with the missing Fare value
missing.fare.index <- which( is.na(test.data$Fare) )
# identify the passenger's class
missing.fare.pclass <- test.data$Pclass[missing.fare.index]
# compute median of the Fare variable for the other passangers of the same class
median.fare <- median(x = test.data$Fare[test.data$Pclass == missing.fare.pclass], na.rm = T)
test.data$Fare[missing.fare.index] <- median.fare
# check if the NA value was really repalced
summary(test.data$Fare)

# since we should predict the survival of the passangers, 
# let's see what it looks like in the training set
table( train.data$Survived )
# we can see this as percentages
round(prop.table( table(train.data$Survived)), digits = 4)

# It's well-known that in disasters woman and children are often the first to be rescued
# Let's check if that was the case in the Titanic disaster
# We'll start by looking at the survival based on the gender;
# first, let's see the proportion of males and females in the dataset
train.data$Sex <- factor(train.data$Sex)
summary( train.data$Sex )
# let's see the survival counts based on the gender
xtabs(~Sex + Survived, data = train.data)
# and the proportions
sex.surv.tbl <- round(prop.table(xtabs(~Sex + Survived, data = train.data), margin = 1), digits = 4)

# This is quite informative, but we might better appreciate the data if it is presented visually
# For drawing graphs, we'll use the ggplot2 library
library(ggplot2)

# recall that the Survived varible is of the type int; we can check that with: 
class( train.data$Survived )
# for plotting the survival against the gender of the passengers, we need to 
# transform the Survived variable into a factor variable (like Enum in Java)
train.data$Survived <- factor(x = train.data$Survived, levels = c(0,1), labels = c("No", "Yes"))

# plot survival against the gender 
ggplot(train.data, aes(x = Sex, fill = Survived)) +
  geom_bar(stat="count", width = 0.4, position = 'dodge') +
  xlab("Gender") +
  ylab("Total Count") +
  labs(fill = "Survived")

# let's see the same variables but expressed in percentages
ggplot(as.data.frame(sex.surv.tbl), aes(x=Sex, y=Freq*100, fill=Survived)) +
  geom_bar(stat='identity', width=.4, position = 'dodge') +
  xlab("Gender") +
  ylab("Percentage") +
  ylim(c(0,100)) +
  labs(fill = "Survived") +
  theme_bw()

# Before inspecting if/how age have affected the chances for survival, let's quickly take a look
# at the potential impact of the passanger class (1st, 2nd or 3rd)
# Again, we first have to transform the variable into a factor
train.data$Pclass <- factor(x = train.data$Pclass, 
                            levels = c(1,2,3), 
                            labels = c("1st", "2nd", "3rd"))

# let's now plot the survival against gender and the passanger class
ggplot(train.data, aes(x = Sex, fill = Survived)) +
  facet_wrap(~Pclass) +
  geom_bar(width = 0.4) +
  xlab("Gender") +
  ylab("Total Count") +
  labs(fill = "Survived") +
  ggtitle("Passanger class") +
  theme_bw()


# let's also inspect if the place of embarkment (the Embarked variable)
# affected the survival
ggplot(train.data, aes(x = Embarked, fill = Survived)) +
  geom_bar(width = 0.4, position = 'dodge') +
  xlab("Place of embarkment") +
  ylab("Total Count") +
  labs(fill = "Survived") +
  theme_bw()

# let's see the same variables but expressed in percentages
emb.surv.tbl <- prop.table(table(Embarked=train.data$Embarked, Survived=train.data$Survived), margin = 1)
ggplot(as.data.frame(emb.surv.tbl), aes(x=Embarked, y=Freq*100, fill=Survived)) +
  geom_bar(stat='identity', width=.4, position = 'dodge') +
  xlab("Place of embarkment") +
  ylab("Percentage") +
  ylim(c(0,100)) +
  labs(fill = "Survived") +
  theme_bw()

###################################
######  FEATURE ENGINEERING  ######
###################################
# When creating new features (attributes) to be used for prediction purposes, 
# we need to base those features on the data from both the training and the test sets, 
# so that the features are available both for training the prediction model, 
# and making predictions on the unseen test data.
# Hence, we will merge the training and the test sets and develop new features on the merged data

# first, assure that the training and the test sets have exactly the same structure
# to that end, we add the Survived column to the test data
test.data$Survived <- factor(NA, # NA is like null in Java
                             levels = c(0,1), 
                             labels = c("No", "Yes")) 
# transform the Pclass, Sex and Embarked variables in the test set into factors, since we've done 
# that in the training set (the structure should be exactly the same)  
test.data$Pclass <- factor(x = test.data$Pclass, levels = c(1,2,3), labels = c("1st", "2nd", "3rd"))
test.data$Sex <- factor(test.data$Sex)
test.data$Embarked <- factor(test.data$Embarked)
str(test.data)
# now, we can merge the two datasets
merged.data <- rbind(train.data, test.data)


# Recall that the Age variable has a lot of missing values 
nrow( merged.data[ is.na(merged.data$Age), ]) / dim( merged.data )[1]
# 20% of all observations are missing the Age value; none of the previously considered 
# imputation techniques would produce trustworthy replacements; 
# so, we need to find another way to determine 
# or at least approximate age groups; 
# we'll do that by making use of the Name variable

# To start, let's first inspect the values of this variable
merged.data[1:10, "Name"] # names of the first ten passangers 

# an observation regarding the Name variable:
# it consists of surname, title, first name, and in some cases additional name 
# (maiden name of married woman)
# So, the idea is to use the title of a person as a rough proxy for his/her age

# first, we need to extract title from the Name variable; 
# to that end, we'll split the Name string using "," and "." as delimiters 
# lets' try it first
strsplit(x = merged.data$Name[1], split = "[,|.]")
# we get a list of vectors, where each vector consists of pieces of a person's name
# to extract the title, we need to first get the vector (made out of the person's name)
strsplit(x = merged.data$Name[1], split = "[,|.]")[[1]]
# and then, take the second element of that vector:
strsplit(x = merged.data$Name[1], split = "[,|.]")[[1]][2]
# you might have noticed a space before the title, we'll remove that quickly, but before that,
# we'll apply this procedure to all the rows in the merged dataset to create a new feature:
merged.data$Title <- sapply(X = merged.data$Name, 
                            FUN = function(x) { strsplit(x, "[,|.]")[[1]][2] })

# now, let's remove that leading blank space
merged.data$Title <- sapply(X = merged.data$Title, 
                            FUN = function(x) { substr(x, start = 2, stop = nchar(x))} )

# we can now inspect different kinds of titles we have in the dataset
table( merged.data$Title )
# there are some rarely occuring titles that won't be much usefull for creating a model; 
# so, we'll aggregate those titles into broader categories
ladies <- c("Dona", "Lady", "Mme", "Mrs", "the Countess")
merged.data$Title[ merged.data$Title %in% ladies ] <- "Mrs" # the %in% operator checks to see if a value is an element of the given vector 

# we'll also add an additional attribute that will represent age-gender group
merged.data$AgeGroup <- vector(length = nrow(merged.data), mode = "character")
merged.data$AgeGroup[merged.data$Title=="Mrs"] <- "adult_women"

misses = c("Ms", "Mlle")
merged.data$Title[ merged.data$Title %in% misses ] <- "Miss"
merged.data$AgeGroup[merged.data$Title=="Miss"] <- "girls"

gentelman <- c("Capt", "Col", "Don", "Dr", "Major", "Mr", "Rev", "Sir")
merged.data$Title[ merged.data$Title %in% gentelman ] <- "Mr"
merged.data$AgeGroup[merged.data$Title=="Mr"] <- "adult_men"

masters <- c("Master", "Jonkheer")
merged.data[ merged.data$Title %in% masters, "Title"] <- "Master"
merged.data$AgeGroup[merged.data$Title=="Master"] <- "boys"

# let's see how titles and age groups are distributed after the aggregation
table( merged.data$Title )
table( merged.data$AgeGroup )
# we observe unexpectedly high number/percentage of passangers with
# the 'Miss' title, suggesting that this title is not reflective of the
# age of the female passangers; let's inspect that
ggplot(merged.data[merged.data$Title=="Miss",], aes(x=Age)) + geom_density()
# the Miss title, obviously, refers to unmarried women of various ages
# this also affected our age groups, and we need to fix this 

# let's try to fix it by using the Age varible where available
nrow( merged.data[ !is.na(merged.data$Age) & merged.data$Title == "Miss",])
# we have 213 cases where Age is available for passangers with the 'Miss' title
# if Age is avaialble and greater than 18, and the Title is 'Miss', change the AgeGroup to "adult_women"
nrow( merged.data[ !is.na(merged.data$Age) & 
                     (merged.data$Age >= 18) & 
                     merged.data$Title == "Miss",])
# the AgeGroup will be changed in 146 observations
merged.data$AgeGroup[!is.na(merged.data$Age) & (merged.data$Age >= 18) & 
                       (merged.data$Title == "Miss")] <- "adult_women"

# if we take another look at the merged titles...
round(prop.table(table( merged.data$Title )), digits = 4)
# ... we'll observe a high percentage of 'Mr' title (~60%) and a very small percentage of 'Master' (~5%)
# again, we can make use of the Age variable, when available, to try to improve this
nrow( merged.data[ !is.na(merged.data$Age) & merged.data$Title == "Mr",])
# Age is available for 605 'Mr' passangers
# if Age is avaialble and less than 18, and the Title is 'Mr', change the AgeGroup to "boys"
nrow( merged.data[ !is.na(merged.data$Age) & 
                     (merged.data$Age < 18) & 
                     merged.data$Title == "Mr",])
# 29 observations can be changed from "adult_men" to "boys"
merged.data$AgeGroup[!is.na(merged.data$Age) & (merged.data$Age < 18) & 
                       (merged.data$Title == "Mr")] <- "boys"

# let's check the AgeGroup proportions after these modifications
table( merged.data$AgeGroup )
round(prop.table( table( merged.data$AgeGroup ) ), digits = 4)
# this looks more realistic

# we'll transform both Title and AgeGroup into factor variables, 
# so they can be better used for data exploration and prediction purposes
merged.data$Title <- factor(merged.data$Title)
merged.data$AgeGroup <- factor(merged.data$AgeGroup)
summary(merged.data$Title)
summary(merged.data$AgeGroup)


# let's inspect the (presence of) connection between the age groups and survival
# by ploting one against the other 
# note that we are using only the first 891 instances in the merged dataset as these are
# instances from the training set for which we know the outcome (i.e., survival)
ggplot(merged.data[1:891, ], aes(x = AgeGroup, fill = Survived)) +
  geom_bar(width = 0.4, position = 'dodge') +
  xlab("Age group") +
  ylab("Total Count") +
  labs(fill = "Survived") +
  theme_bw()

# let's examine this also as percentages
age.surv.tbl <- prop.table(table(AgeGroup=merged.data[1:891,]$AgeGroup,
                                 Survived=merged.data[1:891,]$Survived), margin = 1)
ggplot(as.data.frame(age.surv.tbl), aes(x = AgeGroup, y = Freq*100, fill=Survived)) +
  geom_bar(width = 0.4, stat = 'identity', position = 'dodge') +
  xlab("Age/gender group") +
  ylab("Percentage") +
  ylim(c(0,100)) +
  labs(fill = "Survived") +
  theme_bw()
# obviously the age/gender group affects survival

# let's check also for the title
title.surv.tbl <- prop.table(table(Title=merged.data[1:891,]$Title,
                                   Survived=merged.data[1:891,]$Survived), margin = 1)
ggplot(as.data.frame(title.surv.tbl), aes(x = Title, y = Freq*100, fill=Survived)) +
  geom_bar(width = 0.4, stat = 'identity', position = 'dodge') +
  xlab("Title") +
  ylab("Percentage") +
  ylim(c(0,100)) +
  labs(fill = "Survived") +
  theme_bw()
# as expected, simlar to AgeGroup

# Let's also see how economic status (approximated via passenger class) 
# affects this distribution
ggplot(merged.data[1:891, ], aes(x = AgeGroup, fill = Survived)) +
  facet_wrap(~Pclass) +
  geom_bar(width = 0.4, position = 'dodge') +
  xlab("Age.gender group") +
  ylab("Total Count") +
  labs(fill = "Survived") +
  ggtitle("Passanger class") +
  theme_bw()
# obviously the economic status played a significant role

# we'll introduce an additional feature related to the number of family members 
# one is travelling with
# to that end, we'll simply add SibSp (the number of siblings and spouses) and 
# Parch (the number of parents and children the passengers had with them)
merged.data$FamilySize <- merged.data$SibSp + merged.data$Parch
summary( merged.data$FamilySize )
table( merged.data$FamilySize )
# obviously, majority of people were travelling alone
# it can be also observed that those who travelled with 3+ family members were not that numerous
nrow(merged.data[merged.data$FamilySize>=3,])
# 125 (out of 1309) passangers, ~10%
# so, we can aggregate instances with 3+ family members:
merged.data$FamilySize[ merged.data$FamilySize > 3 ] <- 3
# and turn FamilySize into a factor
merged.data$FamilySize <- factor(x = merged.data$FamilySize, levels = c(0,1,2,3), 
                                 labels = c("Alone", "1", "2", "3+"))
table( merged.data$FamilySize )

# Let's see how this new feature affects the survival prospects
ggplot(merged.data[1:891, ], aes(x = FamilySize, fill = Survived)) +
  geom_bar(width = 0.4, position = 'dodge') +
  xlab("Family size") +
  ylab("Total Count") +
  labs(fill = "Survived") +
  theme_bw()

# let's save the dataset that we've crated so far
write.csv(x = merged.data, file = "data/merged_data_prepared.csv", row.names = F)
saveRDS(object = merged.data, file = "data/merged_data_prepared.RData")
