# install.packages("ggplot2")
library(ggplot2)
Read the dataset.
An engineered and highly simplified version: “The Beatles songs dataset, v1, no NAs.csv”:
# Load the engineered, NA-free v1 dataset; keep text columns as character, not factors
the.beatles.songs <- read.csv("The Beatles songs dataset, v1, no NAs.csv",
stringsAsFactors = FALSE)
summary(the.beatles.songs)  # overview: column types, numeric ranges (no NA counts in v1)
## Title Year Album.debut Duration
## Length:310 Min. :1958 Length:310 Min. : 23.0
## Class :character 1st Qu.:1963 Class :character 1st Qu.:133.0
## Mode :character Median :1965 Mode :character Median :150.0
## Mean :1965 Mean :159.6
## 3rd Qu.:1968 3rd Qu.:172.8
## Max. :1980 Max. :502.0
## Other.releases Genre Songwriter Lead.vocal
## Min. : 0.00 Length:310 Length:310 Length:310
## 1st Qu.: 0.00 Class :character Class :character Class :character
## Median : 9.00 Mode :character Mode :character Mode :character
## Mean :10.42
## 3rd Qu.:16.00
## Max. :56.00
## Top.50.Billboard
## Min. :-1.000
## 1st Qu.:-1.000
## Median :-1.000
## Mean : 3.158
## 3rd Qu.:-1.000
## Max. :50.000
A more realistic, but still very simple version:
# Load the more realistic v3 dataset (contains NAs and ""s); overwrite the v1 dataframe
the.beatles.songs <- read.csv("The Beatles songs dataset, v3.csv",
stringsAsFactors = FALSE)
summary(the.beatles.songs)  # note the per-column NA's counts for numeric columns
## Title Year Album.debut Duration
## Length:310 Length:310 Length:310 Min. : 23.0
## Class :character Class :character Class :character 1st Qu.:130.0
## Mode :character Mode :character Mode :character Median :149.0
## Mean :160.6
## 3rd Qu.:176.0
## Max. :502.0
## NA's :29
## Other.releases Single.A.side Single.B.side
## Min. : 2.00 Length:310 Length:310
## 1st Qu.: 8.00 Class :character Class :character
## Median :12.50 Mode :character Mode :character
## Mean :14.96
## 3rd Qu.:19.25
## Max. :56.00
## NA's :94
## Single.certification Genre Styles
## Length:310 Length:310 Length:310
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## Themes Moods Songwriter
## Length:310 Length:310 Length:310
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## Lead.vocal Cover Covered.by
## Length:310 Length:310 Min. : 1.00
## Class :character Class :character 1st Qu.: 3.00
## Mode :character Mode :character Median : 7.00
## Mean :11.50
## 3rd Qu.:15.75
## Max. :70.00
## NA's :128
## Chart.position.UK.Wikipedia Chart.position.US.Wikipedia
## Min. : 1.000 Min. : 1.00
## 1st Qu.: 1.000 1st Qu.: 1.00
## Median : 1.000 Median : 10.50
## Mean : 9.957 Mean : 25.81
## 3rd Qu.: 4.000 3rd Qu.: 46.25
## Max. :90.000 Max. :102.00
## NA's :264 NA's :242
## Highest.position.The.Guardian Weeks.on.chart.in.UK.The.Guardian
## Min. : 1.00 Min. : 1.0
## 1st Qu.: 1.00 1st Qu.: 7.5
## Median : 1.00 Median :12.0
## Mean : 8.29 Mean :12.1
## 3rd Qu.: 5.50 3rd Qu.:14.5
## Max. :63.00 Max. :33.0
## NA's :279 NA's :279
## Weeks.at.No1.in.UK.The.Guardian Highest.position.Billboard
## Min. :2.000 Min. : 1.000
## 1st Qu.:3.000 1st Qu.: 1.000
## Median :3.500 Median : 3.000
## Mean :4.125 Mean : 8.878
## 3rd Qu.:5.250 3rd Qu.:12.000
## Max. :7.000 Max. :47.000
## NA's :294 NA's :261
## Weeks.at.No1.Billboard Top.50.Billboard Top.50.Ultimate.classic.rock
## Min. :2.0 Min. : 1.00 Min. : 1.00
## 1st Qu.:2.0 1st Qu.:13.00 1st Qu.:14.00
## Median :3.0 Median :25.00 Median :26.00
## Mean :3.6 Mean :25.31 Mean :25.96
## 3rd Qu.:4.5 3rd Qu.:38.00 3rd Qu.:38.00
## Max. :9.0 Max. :50.00 Max. :50.00
## NA's :295 NA's :261 NA's :261
## Top.50.Rolling.Stone Top.50.NME Top.50.Top50songs.org
## Min. : 1.00 Min. : 1.00 Min. : 1.00
## 1st Qu.:13.00 1st Qu.:13.00 1st Qu.:13.25
## Median :26.00 Median :26.00 Median :25.50
## Mean :25.55 Mean :25.69 Mean :25.50
## 3rd Qu.:38.00 3rd Qu.:38.00 3rd Qu.:37.75
## Max. :50.00 Max. :50.00 Max. :50.00
## NA's :261 NA's :261 NA's :260
## Top.50.USA.today.2017 Top.50.Vulture.by.Bill.Wyman
## Min. : 1.00 Min. : 1.00
## 1st Qu.:13.25 1st Qu.:11.25
## Median :25.50 Median :25.50
## Mean :25.50 Mean :25.50
## 3rd Qu.:37.75 3rd Qu.:39.75
## Max. :50.00 Max. :50.00
## NA's :260 NA's :268
NAs are missing values; empty strings (“”) are not the same as NAs.
Different ways of checking if there are NAs and/or “”s:
summary(<dataframe>)
which(complete.cases(<dataframe>) == FALSE) # which rows contain NAs
length(which(complete.cases(<dataframe>) == FALSE)) # how many such rows
which(is.na(<dataframe>[<row>, ])) # NAs in <row>
which(is.na(<dataframe>$<column>)) # NAs in <column>
which(<dataframe>$<column> == "") # ""s in <column>
nrow(<dataframe>[<dataframe>$<column> == "", ]) # how many such rows
summary(the.beatles.songs)
## Title Year Album.debut Duration
## Length:310 Length:310 Length:310 Min. : 23.0
## Class :character Class :character Class :character 1st Qu.:130.0
## Mode :character Mode :character Mode :character Median :149.0
## Mean :160.6
## 3rd Qu.:176.0
## Max. :502.0
## NA's :29
## Other.releases Single.A.side Single.B.side
## Min. : 2.00 Length:310 Length:310
## 1st Qu.: 8.00 Class :character Class :character
## Median :12.50 Mode :character Mode :character
## Mean :14.96
## 3rd Qu.:19.25
## Max. :56.00
## NA's :94
## Single.certification Genre Styles
## Length:310 Length:310 Length:310
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## Themes Moods Songwriter
## Length:310 Length:310 Length:310
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## Lead.vocal Cover Covered.by
## Length:310 Length:310 Min. : 1.00
## Class :character Class :character 1st Qu.: 3.00
## Mode :character Mode :character Median : 7.00
## Mean :11.50
## 3rd Qu.:15.75
## Max. :70.00
## NA's :128
## Chart.position.UK.Wikipedia Chart.position.US.Wikipedia
## Min. : 1.000 Min. : 1.00
## 1st Qu.: 1.000 1st Qu.: 1.00
## Median : 1.000 Median : 10.50
## Mean : 9.957 Mean : 25.81
## 3rd Qu.: 4.000 3rd Qu.: 46.25
## Max. :90.000 Max. :102.00
## NA's :264 NA's :242
## Highest.position.The.Guardian Weeks.on.chart.in.UK.The.Guardian
## Min. : 1.00 Min. : 1.0
## 1st Qu.: 1.00 1st Qu.: 7.5
## Median : 1.00 Median :12.0
## Mean : 8.29 Mean :12.1
## 3rd Qu.: 5.50 3rd Qu.:14.5
## Max. :63.00 Max. :33.0
## NA's :279 NA's :279
## Weeks.at.No1.in.UK.The.Guardian Highest.position.Billboard
## Min. :2.000 Min. : 1.000
## 1st Qu.:3.000 1st Qu.: 1.000
## Median :3.500 Median : 3.000
## Mean :4.125 Mean : 8.878
## 3rd Qu.:5.250 3rd Qu.:12.000
## Max. :7.000 Max. :47.000
## NA's :294 NA's :261
## Weeks.at.No1.Billboard Top.50.Billboard Top.50.Ultimate.classic.rock
## Min. :2.0 Min. : 1.00 Min. : 1.00
## 1st Qu.:2.0 1st Qu.:13.00 1st Qu.:14.00
## Median :3.0 Median :25.00 Median :26.00
## Mean :3.6 Mean :25.31 Mean :25.96
## 3rd Qu.:4.5 3rd Qu.:38.00 3rd Qu.:38.00
## Max. :9.0 Max. :50.00 Max. :50.00
## NA's :295 NA's :261 NA's :261
## Top.50.Rolling.Stone Top.50.NME Top.50.Top50songs.org
## Min. : 1.00 Min. : 1.00 Min. : 1.00
## 1st Qu.:13.00 1st Qu.:13.00 1st Qu.:13.25
## Median :26.00 Median :26.00 Median :25.50
## Mean :25.55 Mean :25.69 Mean :25.50
## 3rd Qu.:38.00 3rd Qu.:38.00 3rd Qu.:37.75
## Max. :50.00 Max. :50.00 Max. :50.00
## NA's :261 NA's :261 NA's :260
## Top.50.USA.today.2017 Top.50.Vulture.by.Bill.Wyman
## Min. : 1.00 Min. : 1.00
## 1st Qu.:13.25 1st Qu.:11.25
## Median :25.50 Median :25.50
## Mean :25.50 Mean :25.50
## 3rd Qu.:37.75 3rd Qu.:39.75
## Max. :50.00 Max. :50.00
## NA's :260 NA's :268
which(complete.cases(the.beatles.songs) == FALSE)
## [1] 1 2 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
## [18] 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35
## [35] 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52
## [52] 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69
## [69] 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86
## [86] 87 88 89 90 91 92 94 95 96 97 98 100 101 102 103 104 105
## [103] 106 107 108 109 110 111 112 113 114 115 116 117 118 120 121 122 123
## [120] 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140
## [137] 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157
## [154] 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174
## [171] 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191
## [188] 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208
## [205] 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225
## [222] 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242
## [239] 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259
## [256] 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276
## [273] 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293
## [290] 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310
which(is.na(the.beatles.songs$Top.50.Billboard))
## [1] 1 2 4 5 6 9 10 11 12 15 16 17 18 19 20 21 23
## [18] 24 25 26 27 28 29 30 31 32 33 34 36 37 38 39 40 41
## [35] 42 43 44 45 47 48 49 51 52 53 54 55 57 58 59 60 61
## [52] 62 65 66 67 68 69 70 71 72 73 75 76 78 79 80 81 82
## [69] 83 84 85 86 88 89 90 91 94 95 96 97 98 100 101 102 103
## [86] 104 105 106 109 110 111 112 113 114 115 117 118 120 121 122 123 124
## [103] 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142
## [120] 143 144 145 146 147 148 149 150 151 152 153 154 155 156 158 159 161
## [137] 162 163 164 165 166 168 169 170 171 172 173 174 175 176 177 179 180
## [154] 181 182 183 184 185 186 187 189 190 191 192 193 195 196 197 198 199
## [171] 200 201 202 206 207 209 212 213 215 216 217 218 219 220 221 222 223
## [188] 224 225 226 227 229 231 232 233 235 236 237 239 240 242 243 244 245
## [205] 246 247 248 249 250 252 253 254 256 257 258 259 260 262 263 264 265
## [222] 266 267 268 269 270 272 273 274 275 276 278 279 280 282 283 284 285
## [239] 286 287 288 289 290 291 292 293 294 296 297 299 300 301 302 303 304
## [256] 305 306 307 308 309 310
which(the.beatles.songs$Single.A.side == "")
## [1] 1 4 5 6 8 9 11 15 16 17 18 20 24 25 26 27 28
## [18] 29 30 31 32 34 36 37 38 39 40 41 43 44 45 47 48 49
## [35] 51 52 53 54 55 57 58 59 61 62 65 66 67 68 69 70 71
## [52] 72 76 78 80 81 82 83 84 85 86 88 89 90 91 95 96 97
## [69] 100 101 102 103 104 106 109 110 111 112 113 114 115 118 120 121 122
## [86] 123 124 128 131 133 134 135 136 137 138 139 140 142 143 144 145 146
## [103] 147 148 149 150 151 153 155 158 159 161 162 163 164 165 166 168 169
## [120] 170 171 172 173 174 175 176 177 179 180 181 184 185 186 187 190 191
## [137] 192 193 196 197 199 200 201 202 206 209 212 213 215 217 219 220 221
## [154] 222 223 224 226 227 229 231 232 235 236 237 239 240 242 243 244 245
## [171] 246 247 248 249 252 253 254 256 257 258 259 262 264 265 266 268 270
## [188] 272 273 274 275 276 278 279 280 283 284 285 286 288 289 290 292 293
## [205] 294 296 301 302 303 304 305 306 307 308 309 310
nrow(the.beatles.songs[(the.beatles.songs$Single.A.side == "") & (the.beatles.songs$Year == 1967), ])
## [1] 17
Use the Amelia package:
library(Amelia) # visualizes NAs, BUT NOT ""s !!!
+ # to visualize ""s with Amelia as well, replace ""s with NAs
par(mfrow=c(1,2)) # structure the display area to show two plots in the same row
missmap(obj = <dataframe>,
+ main = "<title>",
+ legend = FALSE)
par(mfrow=c(1,1)) # revert the plotting area to the default (one plot per row)
Amelia is not absolutely necessary for small-scale problems, since summary(<dataframe>) clearly shows NAs as well (and so do which(complete.cases(<dataframe>) == FALSE) and which(is.na(<dataframe>$<column>))).
library(Amelia)  # for missmap(); visualizes NAs only (""s must be converted to NA first)
## Loading required package: Rcpp
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.4, built: 2015-12-05)
## ## Copyright (C) 2005-2017 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
par(mfrow=c(1,2))  # two plots side by side
# Missingness maps: full dataset, then with columns 3-15 dropped for a clearer view
missmap(obj = the.beatles.songs, main = "The Beatles songs dataset NAs (1/2)", legend = FALSE)
missmap(obj = the.beatles.songs[, c(-3:-15)], main = "The Beatles songs dataset NAs (2/2)", legend = FALSE)
par(mfrow=c(1,1))  # restore the default single-plot layout
In a situation like this, the missing values are replaced by the ‘majority class’ (the dominant value).
unique(<dataframe>$<column>) # how many different values
xtabs(~<column>, data = <dataframe>) # show frequencies, but not for NAs
table(<dataframe>$<column>) # show frequencies, but not for NAs
table(<dataframe>$<column>, useNA = "ifany") # show frequencies for NAs as well
unique(the.beatles.songs$Year) # how many different values of Year
## [1] "1965" "1967" "1964" "1963" "1968"
## [6] "1961" "1969" "1966" "1962" "1960"
## [11] "196?" "1977/1994" "1970" "1958" "1980/1995"
# Clean the malformed "196?" Year value by replacing it with the majority class.
which(the.beatles.songs$Year == "196?") # locate the malformed value
## [1] 69
na.index <- which(the.beatles.songs$Year == "196?") # keep the index instead of hardcoding row 69
the.beatles.songs$Year[na.index] <- NA # turn "196?" into NA
the.beatles.songs$Year <- as.factor(the.beatles.songs$Year) # represent Year as a factor
table(the.beatles.songs$Year, useNA = "ifany") # show frequencies, including NA
##
## 1958 1960 1961 1962 1963 1964 1965
## 2 4 3 20 66 41 37
## 1966 1967 1968 1969 1970 1977/1994 1980/1995
## 19 27 45 42 1 1 1
## <NA>
## 1
# names(which.max(...)) yields the majority class itself;
# max(as.integer(table(...))) only gave its count (66), forcing a hardcoded "1963"
majority.year <- names(which.max(table(the.beatles.songs$Year)))
majority.year
## [1] "1963"
the.beatles.songs$Year[na.index] <- majority.year # replace the NA with the majority class
xtabs(~Year, the.beatles.songs) # verify the replacement
## Year
## 1958 1960 1961 1962 1963 1964 1965
## 2 4 3 20 67 41 37
## 1966 1967 1968 1969 1970 1977/1994 1980/1995
## 19 27 45 42 1 1 1
# Persist this cleaned version (note: saveRDS() files are conventionally named .rds, not .RData)
saveRDS(the.beatles.songs, "The Beatles songs dataset, v5.1.RData") # save this version for later use
# the.beatles.songs <- readRDS("The Beatles songs dataset, v5.1.RData")
Replace the missing values with the average value of the variable on a subset of instances that are the closest (the most similar) to the instance(s) with the missing value. If the variable is normally distributed (shapiro.test()
), use the mean; otherwise, use the median. In the simplest case, use the entire range of instances.
summary(<dataframe>$<numeric column>) # inspect normality
plot(density(<dataframe>$<numeric column>, na.rm = TRUE)) # inspect normality
shapiro.test(<dataframe>$<numeric column>) # inspect normality
<indices> <- which(is.na(<dataframe>$<numeric column>)) # get the indices of NAs in <dataframe>$<numeric column>
<dataframe>$<numeric column>[<indices>] <- # in case of normal distribution
+ mean(<dataframe>$<numeric column>, na.rm = TRUE)
<dataframe>$<numeric column>[<indices>] <- # in other cases
+ median(<dataframe>$<numeric column>, na.rm = TRUE)
summary(the.beatles.songs$Duration)  # 29 NAs to impute; median 149
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 23.0 130.0 149.0 160.6 176.0 502.0 29
plot(density(the.beatles.songs$Duration, na.rm = TRUE))  # visual normality check
shapiro.test(the.beatles.songs$Duration)  # p < 2.2e-16: reject normality, so use the median
##
## Shapiro-Wilk normality test
##
## data: the.beatles.songs$Duration
## W = 0.81262, p-value < 2.2e-16
original.Duration <- the.beatles.songs$Duration # save it for the other examples
indices <- which(is.na(the.beatles.songs$Duration)) # rows to impute
# Distribution is not normal (Shapiro-Wilk p < 2.2e-16), so impute the median.
# Use median() directly instead of summary(...)[3]: summary() rounds its values,
# so the extracted "median" can be inaccurate; median(na.rm = TRUE) is exact.
the.beatles.songs$Duration[indices] <-
as.integer(median(the.beatles.songs$Duration, na.rm = TRUE))
Use linear regression to PREDICT the missing values, and replace the missing values with the predicted ones:
<indices of missing values> <-
+ which(is.na(<dataframe>$<numeric column>))
length(<indices of missing values>) # verify that the number is small
# install.packages("rpart")
library(rpart)
<regression tree> <-
+ rpart(<output variable> ~ # build the regression tree (the model)
+ <predictor variable 1> +
+ <predictor variable 2> + ..., # . to include all variables
+ data = # the entire dataframe, but
+ <dataframe>[-<indices of missing values>, ], # leaving out the rows with NAs
+ method = "anova") # build regression tree
# install.packages('rattle')
# install.packages('rpart.plot')
# install.packages('RColorBrewer')
library(rattle)
library(rpart.plot)
library(RColorBrewer)
fancyRpartPlot(<regression tree>) # plot the regression tree
<predicted values for NAs> <- # make predictions
+ predict(object = <regression tree>,
+ newdata = <dataframe>[<indices of missing values>, ])
<original <dataframe>$<numeric column> as a dataframe> <- # save it for later assessment
+ data.frame(<numeric column name> = <dataframe>$<numeric column>)
<original <dataframe>$<numeric column> as a dataframe>$lbl <- "before"
ggplot(<original <dataframe>$<numeric column> as a dataframe>, # optional: plot the density of
+ aes(x = <numeric column>)) + # the original <numeric column>
+ geom_density()
<dataframe>$<numeric column>[<indices of missing values>] <- # impute predicted values
+ as.integer(<predicted values for NAs>)
summary(<dataframe>$<numeric column>) # verify that NAs are eliminated
summary(<original <dataframe>$<numeric column> as a dataframe>$<numeric column>)
<modified <dataframe>$<numeric column> as a dataframe> <-
+ data.frame(<numeric column name> = <dataframe>$<numeric column>)
<modified <dataframe>$<numeric column> as a dataframe>$lbl <- "after"
<before-and-after dataframe> <- # append 2 new dataframes:
+ rbind(<original <dataframe>$<numeric column> as a dataframe>, # before and
+ <modified <dataframe>$<numeric column> as a dataframe>) # after the imputation
library(ggplot2)
ggplot(<before-and-after dataframe>, # plot the results
+ aes(x = <numeric column name>, fill = lbl)) +
+ geom_density(alpha = 0.2) + # alpha: plot transparency (0-1, optional)
+ theme_bw()
# Impute missing Duration values with predictions from a simple regression tree.
the.beatles.songs$Duration <- original.Duration # restore the original Duration (with NAs)
na.rows <- which(is.na(the.beatles.songs$Duration)) # rows that need imputation
library(rpart)
# Fit a regression tree predicting Duration from Year ("anova" = regression)
duration.tree <- rpart(Duration ~ Year,
  data = the.beatles.songs,
  method = "anova")
library(rattle)
library(rpart.plot)
library(RColorBrewer)
fancyRpartPlot(duration.tree) # visualize the fitted tree
# Predict Duration only for the rows where it is missing
predicted.durations <-
  predict(object = duration.tree,
          newdata = the.beatles.songs[na.rows, ])
# Snapshot the pre-imputation values for the density comparison below
duration.before <- data.frame(Duration = the.beatles.songs$Duration)
duration.before$lbl <- "before"
# ggplot(duration.before, aes(x = Duration)) + geom_density()
the.beatles.songs$Duration[na.rows] <- predicted.durations # impute predicted values
summary(the.beatles.songs$Duration) # NAs are gone
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 23.0 133.0 148.0 159.9 502.0
summary(duration.before$Duration) # the original, with 29 NAs
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 23.0 130.0 149.0 160.6 176.0 502.0 29
# Post-imputation snapshot, then overlay both densities to assess the change
duration.after <- data.frame(Duration = the.beatles.songs$Duration)
duration.after$lbl <- "after"
duration.before.and.after <- rbind(duration.before, duration.after)
library(ggplot2)
ggplot(duration.before.and.after, aes(x = Duration, fill = lbl)) +
  geom_density(alpha = 0.3) + # alpha: plot transparency
  theme_bw()
Use non-NA values from a subset of SIMILAR observations (similar in terms of having (nearly) the same values of other relevant features). Suitable when the number of NAs is very small, since the replacements typically go one by one. The example in the YouTube video Introduction to Data Science with R - Exploratory Modeling 2, from 1:24:38 to 1:28:04, l. 805-815, where one is looking at similar records in the dataset, in order to find a suitable replacement for the missing value. This approach has not been demonstrated here because no suitable feature is available in the dataset (a numeric feature with a very small number of NAs), but the approach is worth mentioning.
A more sophisticated imputation is applied in such cases (out of scope of this course). It is, in fact, the task of predicting (good substitutes for) the missing values. The other option is to create some new variables (“proxies”) and do some feature engineering.
Get rid of all other NAs and factorize suitable features before attempting feature selection and engineering:
source("Get rid of NAs.R")
the.beatles.songs <- getRidOfNAs(the.beatles.songs)
saveRDS(the.beatles.songs, "The Beatles songs dataset, v5.2.RData")
the.beatles.songs <- factorize(the.beatles.songs)
saveRDS(the.beatles.songs, "The Beatles songs dataset, v5.3.RData")
Overall philosophy: pick features one by one, or in suitable pairs, and see how they can be engineered to increase the quality/precision of predictions. Generally, plot variables and their values (with na.rm = TRUE) whenever it makes sense. It gives you a better sense of the predictive power of each variable.
Split the dataset into train and test sets:
# install.packages("caret")
library(caret)
set.seed(<n>)
<train dataset indices> <- # stratified partitioning:
+ createDataPartition(<dataset>$<output variable>, # the same distribution of the output variable in both sets
+ p = .80, # 80/20% of data in train/test sets
+ list = FALSE) # don't make a list of results, make a matrix
<train dataset> <- <dataset>[<train dataset indices>, ]
<test dataset> <- <dataset>[-<train dataset indices>, ]
library(caret)
set.seed(444)  # reproducible split
# Stratified 80/20 partition: preserves the Top.50.Billboard class distribution in both sets
train.data.indices <- createDataPartition(the.beatles.songs$Top.50.Billboard, p = 0.80, list = FALSE)
train.data <- the.beatles.songs[train.data.indices, ]
test.data <- the.beatles.songs[-train.data.indices, ]
Examine the predictive power of variables from the data set by means of tables, frequencies and proportions. For a categorical predictor, check its frequencies and proportions in the dataset: summary(<dataset>$<predictor>) # frequencies
round(summary(<dataset>$<predictor>) / nrow(<dataset>), digits = 3) # proportions
# Frequencies of the categorical predictor in the training data
summary(train.data$Single.certification)
## No RIAA 2xPlatinum RIAA 4xPlatinum
## 208 4 1
## RIAA Gold RIAA Gold, BPI Silver RIAA Platinum
## 27 1 8
# Same figures as proportions of the training set
round(summary(train.data$Single.certification) / nrow(train.data), digits = 3)
## No RIAA 2xPlatinum RIAA 4xPlatinum
## 0.835 0.016 0.004
## RIAA Gold RIAA Gold, BPI Silver RIAA Platinum
## 0.108 0.004 0.032
Then examine the frequencies and the proportions of the output variable values (classes) based on the predictor values:
xtabs(~<predictor> + <output variable>, data = <dataset>) # frequencies
prop.table(xtabs(~<predictor> + <output variable>, data = <dataset>), # proportions
+ margin = 1) # by row
# Cross-tabulate predictor vs. output: how certification relates to Top 50 membership
xtabs(~Single.certification + Top.50.Billboard, data = train.data)
## Top.50.Billboard
## Single.certification No Yes
## No 200 8
## RIAA 2xPlatinum 0 4
## RIAA 4xPlatinum 0 1
## RIAA Gold 7 20
## RIAA Gold, BPI Silver 0 1
## RIAA Platinum 2 6
# Row-wise proportions (margin = 1): P(Top.50 | certification level)
prop.table(xtabs(~Single.certification + Top.50.Billboard, data = train.data), margin = 1)
## Top.50.Billboard
## Single.certification No Yes
## No 0.96153846 0.03846154
## RIAA 2xPlatinum 0.00000000 1.00000000
## RIAA 4xPlatinum 0.00000000 1.00000000
## RIAA Gold 0.25925926 0.74074074
## RIAA Gold, BPI Silver 0.00000000 1.00000000
## RIAA Platinum 0.25000000 0.75000000
For a numeric predictor with a small number of values, first convert it into a factor:
<dataset>$<predictor> <- factor(<dataset>$<predictor>,
+ levels = c(<n1>, <n2>, ...),
+ labels = c("<l1>", "<l2>", ...))
However, make sure to keep the original (numeric) values from the integrated dataset (including both train and test datasets) for possible later use:
<original numeric predictor> <- <integrated dataset>$<predictor>
unique(train.data$Weeks.at.No1.in.UK.The.Guardian)  # few distinct values -> suitable for a factor
## [1] 0 3 5 7 6 2 4
original.Weeks.at.No1.in.UK.The.Guardian <- # keep it for later
the.beatles.songs$Weeks.at.No1.in.UK.The.Guardian
# Specify ALL levels explicitly so train and test end up with identical level sets
train.data$Weeks.at.No1.in.UK.The.Guardian <-
factor(train.data$Weeks.at.No1.in.UK.The.Guardian,
levels = c(0, 2, 3, 4, 5, 6, 7),
labels = c("0", "2", "3", "4", "5", "6", "7"))
Then examine frequencies and proportions as above, or plot the output variable against the predictor:
<gg> <- ggplot(<dataset>, aes(x = <predictor name>, fill = <output variable name>)) +
+ geom_bar(position = "dodge", width = <bin width>) + # "dodge": bargraph, <bin width>: 0.2-0.4
+ labs(x = "<x-label>", y = "<y-label>", title = "<title>") +
+ theme_bw()
<gg>
<gf> <- <gg> + facet_wrap(~<another predictor name>) # examine 2 predictors together
<gf>
# Billboard Top 50 membership vs. weeks at No.1 in the UK (grouped bar chart)
gg1 <- ggplot(data = train.data, aes(x = Weeks.at.No1.in.UK.The.Guardian, fill = Top.50.Billboard)) +
geom_bar(position = "dodge", width = 0.6) +
ylab("Number of Billboard Top 50 songs") + xlab("Weeks at No.1 in UK (The Guardian)") + # fixed label typo: "and" -> "at"
theme_bw()
gg1
# Top 50 membership by Year
gg2 <- ggplot(data = train.data, aes(x = Year, fill = Top.50.Billboard)) +
geom_bar(position = "dodge", width = 0.6) +
ylab("Number of Billboard Top 50 songs") + xlab("Year") +
theme_bw()
gg2
# Top 50 membership by certification level
gg3 <- ggplot(data = train.data, aes(x = Single.certification, fill = Top.50.Billboard)) +
geom_bar(position = "dodge", width = 0.6) +
labs(y = "Number of Billboard Top 50 songs", x = "Single certification", title = "Certified") +
theme_bw()
gg3
# Two predictors at once: certification, faceted by Year
gg4 <- gg3 + facet_wrap(~Year)
gg4
When creating new features (attributes) to be used for prediction purposes, we should merge the training and the test sets and develop new features on the merged data. Before that, we need to assure that the training and the test sets have exactly the same structure.
In the test set, do all the modifications that have been done in the train set:
unique(test.data$Weeks.at.No1.in.UK.The.Guardian)  # the test set covers only a subset of values
## [1] 0 3 2
# Use the SAME levels/labels as the train set so the factors align and rbind() succeeds
test.data$Weeks.at.No1.in.UK.The.Guardian <-
factor(test.data$Weeks.at.No1.in.UK.The.Guardian,
levels = c(0, 2, 3, 4, 5, 6, 7),
labels = c("0", "2", "3", "4", "5", "6", "7"))
Merge the train and test datasets into one for creating new features:
<adapted dataset> <- rbind(<adapted train dataset>, <adapted test dataset>)
saveRDS(<adapted dataset>, "<RData filename>")
# Merge back: train rows come FIRST, so rows 1..nrow(train.data) are the training data
the.beatles.songs <- rbind(train.data, test.data)
saveRDS(the.beatles.songs, "The Beatles songs dataset, v5.4.RData")
A proxy is a new variable that approximates an original one, or is a good replacement for the original one.
How many Top 50 Billboard songs performed by the Beatles are covers of other authors’ songs?
# Indices of all cover songs in the dataset
which(the.beatles.songs$Cover == "Yes") # how many cover songs (all)
## [1] 1 3 4 10 13 16 17 23 24 26 30 33 38 41 44 48 55
## [18] 64 71 82 85 89 90 91 92 109 115 116 127 129 130 133 137 138
## [35] 145 147 151 153 156 161 167 172 179 180 183 192 194 195 200 201 207
## [52] 211 216 222 224 226 227 239 248 249 253 268 275 278 286 288 290 291
## [69] 293 296 299
top.50.bb.indices <- which(the.beatles.songs$Top.50.Billboard == "Yes") # all Billboard Top 50 songs
top.50.bb.indices
## [1] 3 4 8 9 25 35 39 45 50 51 59 62 70 75 76 88 95
## [18] 98 104 132 134 141 157 168 169 170 173 175 176 178 188 190 196 198
## [35] 206 213 221 227 240 243 252 257 267 269 278 282 290 297 301
# Intersection: of the Top 50 songs, only these 5 positions are covers
which(the.beatles.songs$Cover[top.50.bb.indices] == "Yes") # how many cover songs on Billboard Top 50
## [1] 1 2 38 45 47
Very few Top 50 Billboard songs performed by the Beatles are covers of other authors’ songs, so take a closer look at the Songwriter feature. How many different Songwriter values are there?
unique(the.beatles.songs$Songwriter)
## [1] "Thompson"
## [2] "Lennon"
## [3] "Russell, Morrison"
## [4] "Yellen, Ager"
## [5] "McCartney"
## [6] "McCartney, with Lennon"
## [7] "Alexander"
## [8] "Lennon, with McCartney"
## [9] "Bacharach, David, Dixon"
## [10] "Lennon and McCartney"
## [11] "Williams"
## [12] "Foster"
## [13] "Harrison"
## [14] "Dixon, Farrell"
## [15] "Velázquez, Skylar"
## [16] "Berry"
## [17] "Goffin, King"
## [18] "Pingatore"
## [19] "Lennon and Harrison"
## [20] "Buddy Holly"
## [21] "Drapkin (aka Ricky Dee)"
## [22] "Lennon, with McCartney, Harrison and Starkey"
## [23] "Starkey"
## [24] "Perkins"
## [25] "?"
## [26] "Bennett, Tepper, Schroeder"
## [27] "Charles"
## [28] "Romero"
## [29] "Murray"
## [30] "Stan Kesler and Charlie Feathers"
## [31] "Wilkin, Westberry"
## [32] "Thomas, Biggs"
## [33] "McCartney and Harrison"
## [34] "McCartney, Starkey"
## [35] "Lennon, McCartney, Harrison, Starkey"
## [36] "Leiber, Stoller/Penniman"
## [37] "Lennon/McCartney/Nicolas/Hellmer"
## [38] "Turner, McDougall"
## [39] "J. Burnette, D. Burnette, Burlison, Mortimer"
## [40] "Blackwell, Johnson, Penniman"
## [41] "Penniman, Collins"
## [42] "Lennon with McCartney"
## [43] "Traditional, arr. Lennon, McCartney, Harrison, Starkey"
## [44] "Row, Katz, Roberts, Clayton"
## [45] "Gordy, Bradford"
## [46] "Percy Wenrich, Edward Madden"
## [47] "Johnson"
## [48] "Traditional, arr. Sheridan"
## [49] "Fontaine"
## [50] "Starkey, with uncredited assistance from Harrison"
## [51] "Penniman"
## [52] "Dobbins, Garrett, Holland, Bateman, Gorman"
## [53] "Lennon, with McCartney, Harrison, Starkey"
## [54] "Leiber, Stoller"
## [55] "Isley, Isley, Isley"
## [56] "Lennon and McCartney/Nicolas/Montague"
## [57] "Cason, Moon"
## [58] "Leiber, Stoller, Barrett"
## [59] "Harrison, with uncredited contribution from Lennon"
## [60] "Crudup"
## [61] "Theodorakis, Sansom"
## [62] "Smith, Wheeler-Snyder"
## [63] "Willson"
## [64] "Lennon and/or McCartney"
## [65] "Spector"
## [66] "Medley, Russell"
## [67] "Lennon and McCartney, with Starkey"
## [68] "McCartney (as Bernard Webb or A. Smith)"
## [69] "Holly"
## [70] "Robinson"
## [71] "Lennon, McCartney, Harrison and Starkey"
## [72] "Scott, Marlow"
## [73] "Lennon, McCartney"
## [74] "Twomey, Wise, Weisman"
## [75] "Perkins, Jefferson"
## [76] "Lennon, with Ono and Harrison"
## [77] "Blackwell-Marascalco (\"Rip It Up\"), Calhoun (\"Shake, Rattle, and Roll\"), Perkins (\"Blue Suede Shoes\")"
## [78] "Al Dubin, Harry Warren"
## [79] "Bryant, Bryant"
## [80] "McCartney (Step Inside Love); Lennon, McCartney, Harrison and Starkey (Los Paranoias)"
## [81] "Perkins, Claunch, Cantrell"
## [82] "Allison, Holly, Petty"
How many songs are covers?
length(which(the.beatles.songs$Cover == "Yes"))
## [1] 71
It’s a considerable difference (82 vs. 71), so it’s better to create a proxy for song authorship.
How many songs have, say, John Lennon in the list of authors?
grepl(<substring>, <string>) # TRUE if <string> contains <substring>
<indices> <- # indices of <character variable> containing <substring>
+ grep(<substring>,
+ <dataframe>$<character variable>)
grepl("eat", "The Beatles")  # substring-matching demo
## [1] TRUE
# Indices of songs whose Songwriter string mentions each band member
i.lennon <- grep("Lennon", the.beatles.songs$Songwriter)
i.mccartney <- grep("McCartney", the.beatles.songs$Songwriter)
i.harrison <- grep("Harrison", the.beatles.songs$Songwriter)
i.starkey <- grep("Starkey", the.beatles.songs$Songwriter)
# getSongAuthorProxies() is defined in the sourced file; it returns a list of
# index vectors, presumably disjoint author categories -- see "Song author proxies.R"
source("Song author proxies.R")
authors <- getSongAuthorProxies(i.lennon, i.mccartney, i.harrison, i.starkey)
lennon.songs <- authors[[1]]
mccartney.songs <- authors[[2]]
harrison.songs <- authors[[3]]
starkey.songs <- authors[[4]]
lennon.mccartney.songs <- authors[[5]]
mccartney.lennon.songs <- authors[[6]]
lennon.mccartney.harrison.starkey.songs <- authors[[7]]
Create proxy variables for different authors:
the.beatles.songs$Author <- "Other"  # default for songs matching none of the proxies
# NOTE: assignment order matters -- if the index vectors overlap, later assignments
# overwrite earlier ones, so the combined categories take precedence
the.beatles.songs$Author[lennon.songs] <- "Lennon"
the.beatles.songs$Author[mccartney.songs] <- "McCartney"
the.beatles.songs$Author[harrison.songs] <- "Harrison"
the.beatles.songs$Author[starkey.songs] <- "Starkey"
the.beatles.songs$Author[lennon.mccartney.songs] <- "Lennon/McCartney"
the.beatles.songs$Author[mccartney.lennon.songs] <- "McCartney/Lennon"
the.beatles.songs$Author[lennon.mccartney.harrison.starkey.songs] <- "Lennon/McCartney/Harrison/Starkey"
Convert the new proxy variable into a factor:
# Factorize the proxy and check the resulting class frequencies
the.beatles.songs$Author <- factor(the.beatles.songs$Author)
summary(the.beatles.songs$Author)
## Harrison Lennon
## 26 65
## Lennon/McCartney Lennon/McCartney/Harrison/Starkey
## 36 9
## McCartney McCartney/Lennon
## 68 22
## Other Starkey
## 82 2
Examine the predictive power of the newly created proxy variable:
ggplot(data = <dataset>[1:<training data length>, ], # only the training data from the merged dataset!
+ aes(x = <proxy variable name>,
+ fill = <output variable name>)) +
+ geom_bar(position = "dodge") + # bar graph
+ theme_bw()
# Plot only the training rows of the merged dataset (train rows come first in the rbind);
# derive the row count from train.data instead of hardcoding 249
ggplot(the.beatles.songs[seq_len(nrow(train.data)), ],
aes(x = Author, fill = Top.50.Billboard)) +
geom_bar(position = "dodge") +
theme_bw()
Ideas about Chart.position.UK.Wikipedia, Chart.position.US.Wikipedia, Highest.position.The.Guardian, Weeks.on.chart.in.UK.The.Guardian and Weeks.at.No1.in.UK.The.Guardian:
# Normalized chart position
+ if (<chart position> == 0) {
+ <normalized chart position> <- 0
+ }
+ if (<chart position> %in% (1:<max value>)) {
+ <normalized chart position> <- (<max value> - <chart position> + 1) / <max value>
+ }
# Normalized weeks-on-chart
+ <normalized weeks-on-chart> <- <weeks-on-chart> / <max value>
# Normalized weeks-at-No.1
<normalized weeks-at-No.1> <- <weeks-at-No.1> / <max value>
# Idea: chart presence
# Chart presence in UK/US:
+ <chart presence> <-
+ (<a> * <normalized chart position> +
+ <b> * <normalized weeks-on-chart> +
+ <c> * normalized weeks-at-no-1) /
+ (<a> + <b> + <c>)
# Chart presence in both UK and US (overall): ((a * ch.pr.1 + b * ch.pr.2) / (a + b))
+ <chart presence> <-
+ (<a> * <chart presence in UK> +
+ <b> * <chart presence in US>) /
+ (<a> + <b>)
New variables: Chart.presence.UK, Chart.presence.US, Chart.presence.UK.and.US
# Build the engineered Chart.presence.* features from normalized chart statistics.
# The getNormalized*/getChartPresence* helpers are defined in "Chart presence.R".
the.beatles.songs$Weeks.at.No1.in.UK.The.Guardian <- # restore this as a numeric
original.Weeks.at.No1.in.UK.The.Guardian
saveRDS(the.beatles.songs, "The Beatles songs dataset, v5.5.RData")
source("Chart presence.R")
# Normalize the three UK chart statistics to comparable [0, 1]-style scales
ch.pos.UK.norm <- getNormalizedChartPosition(the.beatles.songs$Chart.position.UK.Wikipedia)
weeks.on.chart.UK.norm <-
getNormalizedWeeksOnChart(the.beatles.songs$Weeks.on.chart.in.UK.The.Guardian)
weeks.at.No1.UK.norm <-
getNormalizedWeeksAtNo1(the.beatles.songs$Weeks.at.No1.in.UK.The.Guardian)
ch.pos.US.norm <- # that's all for US in the dataset
getNormalizedChartPosition(the.beatles.songs$Chart.position.US.Wikipedia)
# Weighted combination of the three UK components (equal weights here)
the.beatles.songs$Chart.presence.UK <-
getChartPresence(ch.pos.UK.norm, weeks.on.chart.UK.norm, weeks.at.No1.UK.norm,
1, 1, 1) # experiment with different weights
the.beatles.songs$Chart.presence.US <-
getChartPresence(ch.pos.US.norm, # that's all for US in the dataset
rep(0, nrow(the.beatles.songs)), # dummy, for the sake of function format
rep(0, nrow(the.beatles.songs)), # dummy, for the sake of function format
1, 0, 0) # 0 weights for missing data
# Overall presence: weighted average of the UK and US scores
the.beatles.songs$Chart.presence.UK.and.US <-
getChartPresenceOverall(the.beatles.songs$Chart.presence.UK,
the.beatles.songs$Chart.presence.US,
1, 1) # experiment with different weights
How relevant are the new variables for predicting Billboard Top 50 Beatles songs?
Discretize and factorize them first:
<dataset>$<new factor feature> <-
+ cut(<dataset>$<numeric feature>,
+ breaks = <n>, # number of intervals to cut the <numeric feature> into
+ labels = c("<lab 1>", "<lab 2>", ..., "<lab n>")) # factor labels
# Discretize each chart-presence score into five equal-width, ordered hype categories.
# The label set is shared, so define it once.
hype.labels <- c("Very Low", "Low", "Neutral", "High", "Very High")
the.beatles.songs$Hype.UK <-
  cut(the.beatles.songs$Chart.presence.UK, breaks = 5, labels = hype.labels)
the.beatles.songs$Hype.US <-
  cut(the.beatles.songs$Chart.presence.US, breaks = 5, labels = hype.labels)
the.beatles.songs$Hype.UK.and.US <-
  cut(the.beatles.songs$Chart.presence.UK.and.US, breaks = 5, labels = hype.labels)
And now plot them:
# Plot only the training rows of the merged dataset (train rows come first in the rbind);
# derive the row count from train.data instead of hardcoding 249
ggplot(the.beatles.songs[seq_len(nrow(train.data)), ],
aes(x = Hype.UK, fill = Top.50.Billboard)) +
geom_bar(position = "dodge") +
theme_bw()
ggplot(the.beatles.songs[seq_len(nrow(train.data)), ],
aes(x = Hype.US, fill = Top.50.Billboard)) +
geom_bar(position = "dodge") +
theme_bw()
ggplot(the.beatles.songs[seq_len(nrow(train.data)), ],
aes(x = Hype.UK.and.US, fill = Top.50.Billboard)) +
geom_bar(position = "dodge") +
theme_bw()
Classification vs. regression:
http://www.simafore.com/blog/bid/62482/2-main-differences-between-classification-and-regression-trees