# install.packages("ggplot2")
library(ggplot2)
Read and clean the dataset:
# Load the preparation helper and run its entry point.
# NOTE(review): presumably this call (re)creates the file read below -- confirm
# in the helper script.
source(file = "Prepare The Beatles song dataset for clustering.R")
prepareTheBeatlesDatasetForClustering()
# saveRDS(object = <dataframe or another R object>, file = "<filename>") # save R object for the next session
# <dataframe or another R object> <- readRDS(file = "<filename>") # restore R object in the next session
# Despite its .RData extension the file is read with readRDS(), i.e. it is
# treated as a single serialized object (saveRDS() format), not a workspace.
the.beatles.songs.num <- readRDS(file = "The Beatles songs dataset (numeric), v4.1.RData")
Check if there are outliers in the data, using boxplots:
boxplot(<dataset>$<column name>, xlab = "<column name>") # basic boxplot for <column name>
boxplot(<dataset>) # basic boxplots for all columns
boxplot(<dataset>)$stats # basic boxplots for all columns, stats
boxplot(<dataset>)$stats[c(1, 5), ] # basic boxplots - whiskers
<output var> <- ggplot(<dataset>, # ggplot2 boxplots
+ aes(x = "",
+ y = <column name>)) + # show boxplot of <column name>
+ geom_boxplot(width = 0.5, fill = "<color>") + # boxplot width and color
+ stat_boxplot(geom ='errorbar', width = 0.15) + # show whiskers, control their width
+ guides(fill = "none") + # no legend (it makes no sense here)
+ xlab("") # no x-axis label (it makes no sense here)
Compare the outputs of boxplot() for the whole dataset and for a single column:
# Boxplot statistics for every column at once: one matrix column per feature,
# rows 1 and 5 being the lower and upper whiskers.
boxplot(x = the.beatles.songs.num)[["stats"]]
## [,1] [,2] [,3] [,4]
## [1,] 80 0 0 0
## [2,] 133 0 0 0
## [3,] 150 9 2 0
## [4,] 173 16 8 0
## [5,] 232 35 20 0
# Boxplot of a single feature.
boxplot(x = the.beatles.songs.num[["Duration"]], xlab = "Duration")
Boxplotting all numeric features using the boxplotFeature() utility function:
source("Boxplotting.R")
boxplotFeature(<dataset with numeric features>, # dataframe with numeric features
+ "<numeric feature>", # numeric feature to boxplot (its name, passed as a string)
+ "<color>") # boxplot fill color
# (Re)load the boxplotFeature() helper.
source("Boxplotting.R")
# One boxplot per numeric feature. Arguments: the data frame, the feature name
# (as a string) and the fill colour. The calls stay at top level so that
# whatever boxplotFeature() returns is auto-printed.
# NOTE(review): presumably it returns a ggplot object -- confirm in Boxplotting.R.
boxplotFeature(the.beatles.songs.num, "Duration", "red")
boxplotFeature(the.beatles.songs.num, "Other.releases", "chartreuse")
boxplotFeature(the.beatles.songs.num, "Covered.by", "orange")
boxplotFeature(the.beatles.songs.num, "Top.50.Billboard", "yellow")
Fix the outlier values for each variable with outliers - replace each outlier value with a specific percentile value of the data, typically 90th or 95th:
boxplot(<dataset>$<column name>, xlab = "<column name>") # basic boxplot for <column name>
boxplot.stats(<dataset>$<column name>) # examine the boxplot more closely
boxplot.stats(<dataset>$<column name>)$out # examine the outliers more closely
boxplot.stats(<dataset>$<column name>)$stats[c(1, 5)] # get the whiskers
boxplot.stats(<dataset>$<column name>)$stats[1] # get the lower whisker
boxplot.stats(<dataset>$<column name>)$stats[5] # get the upper whisker
sort(boxplot.stats(<dataset>$<column name>)$out) # get and sort the outliers
<quantiles> <- quantile(<dataset>$<column name>, # examine the 90th, 95th, ..., percentile
+ probs = seq(from = 0.9, to = 1, by = 0.025))
<new max value> <- # the value to replace the outliers
+ as.numeric(quantile(<dataset>$<column name>, # pick <percentile> closest to
+ probs = <percentile>)) # the upper whisker
<dataset>$<column name>[<dataset>$<column name> >
+ <new max value>] <- # replace the outliers
+ <new max value>
<quantiles> <- quantile(<dataset>$<column name>, # examine the 0th, 5th, ..., percentile
+ probs = seq(from = 0.0, to = 0.1, by = 0.025))
<new min value> <- # the value to replace the outliers
+ as.numeric(quantile(<dataset>$<column name>, # pick <percentile> closest to
+ probs = <percentile>)) # the lower whisker
<dataset>$<column name>[<dataset>$<column name> < <new min value>] <- # replace the outliers
+ <new min value>
Fix the outliers in the first variable manually:
# Visual check first, then the numbers behind the plot.
boxplot(x = the.beatles.songs.num[["Duration"]], xlab = "Duration")
# boxplot.stats() components: $stats = lower whisker, lower hinge, median,
# upper hinge, upper whisker; $n = number of non-NA values; $conf = notch
# limits; $out = the values lying beyond the whiskers.
boxplot.stats(x = the.beatles.songs.num[["Duration"]])
## $stats
## [1] 80 133 150 173 232
##
## $n
## [1] 310
##
## $conf
## [1] 146.4105 153.5895
##
## $out
## [1] 335 237 235 254 258 236 51 266 270 23 431 273 467 388 243 40 66
## [18] 72 236 502 236 245 248 372 286 52 305 241 261 242
# The outliers only, in their original order.
boxplot.stats(x = the.beatles.songs.num[["Duration"]])[["out"]]
## [1] 335 237 235 254 258 236 51 266 270 23 431 273 467 388 243 40 66
## [18] 72 236 502 236 245 248 372 286 52 305 241 261 242
# The two whiskers (elements 1 and 5 of $stats).
boxplot.stats(x = the.beatles.songs.num[["Duration"]])[["stats"]][c(1, 5)]
## [1] 80 232
# The outliers again, sorted, to see the low and the high group separately.
sort(x = boxplot.stats(x = the.beatles.songs.num[["Duration"]])[["out"]])
## [1] 23 40 51 52 66 72 235 236 236 236 237 241 242 243 245 248 254
## [18] 258 261 266 270 273 286 305 335 372 388 431 467 502
# Inspect the upper tail to choose a replacement value for the high outliers.
quantile(x = the.beatles.songs.num$Duration,
         probs = seq(from = 0.9, to = 1, by = 0.025))
## 90% 92.5% 95% 97.5% 100%
## 215.200 234.475 244.100 276.575 502.000
# Cut-off: the 92.5th percentile, the printed quantile closest to the upper
# whisker (232).
# NOTE(review): with the hinges printed by boxplot.stats() above (133 and 173)
# the upper fence is 173 + 1.5 * (173 - 133) = 233, so values capped at 234.475
# may still be flagged as outliers; the 90th percentile (215.2) would stay
# inside the fence. Verify on the re-plot before changing the cut-off, since
# later results in this script depend on it.
new.max.duration <- as.numeric(
  quantile(x = the.beatles.songs.num$Duration, probs = 0.925)
)
# Every value above the cut-off is replaced by the cut-off itself.
the.beatles.songs.num$Duration <- replace(
  the.beatles.songs.num$Duration,
  the.beatles.songs.num$Duration > new.max.duration,
  new.max.duration
)
# Inspect the lower tail to choose a replacement value for the low outliers.
quantile(x = the.beatles.songs.num$Duration,
         probs = seq(from = 0, to = 0.1, by = 0.025))
## 0% 2.5% 5% 7.5% 10%
## 23.000 91.725 105.450 109.525 116.000
# Cut-off: the 2.5th percentile, the printed quantile closest to the lower
# whisker (80).
new.min.duration <- as.numeric(
  quantile(x = the.beatles.songs.num$Duration, probs = 0.025)
)
# Every value below the cut-off is replaced by the cut-off itself.
the.beatles.songs.num$Duration <- replace(
  the.beatles.songs.num$Duration,
  the.beatles.songs.num$Duration < new.min.duration,
  new.min.duration
)
# Re-plot to check the effect of both replacements.
boxplot(x = the.beatles.songs.num[["Duration"]], xlab = "Duration")
# expected: no more outliers in Duration (verify on the plot)
Call fixOutliers() for the other columns:
# source("Fix outliers.R")
# the.beatles.songs.num <- fixOutliers(the.beatles.songs.num, "<column name>")
# Load fixOutliers().
# NOTE(review): presumably it automates the percentile replacement done by hand
# for Duration -- confirm in "Fix outliers.R".
source(file = "Fix outliers.R")
# Other.releases: look at the outliers before fixing them.
boxplot(x = the.beatles.songs.num[["Other.releases"]], xlab = "Other.releases")
boxplot.stats(x = the.beatles.songs.num[["Other.releases"]])
## $stats
## [1] 0 0 9 16 35
##
## $n
## [1] 310
##
## $conf
## [1] 7.564192 10.435808
##
## $out
## [1] 42 42 56 45 44
boxplot.stats(x = the.beatles.songs.num[["Other.releases"]])[["out"]]
## [1] 42 42 56 45 44
boxplot.stats(x = the.beatles.songs.num[["Other.releases"]])[["stats"]][c(1, 5)]
## [1] 0 35
# fixOutliers() gets the data frame and the column name (as a string); its
# result replaces the data frame.
the.beatles.songs.num <- fixOutliers(the.beatles.songs.num, "Other.releases")
boxplot(x = the.beatles.songs.num[["Other.releases"]], xlab = "Other.releases")
# no more outliers in Other.releases
# Covered.by: same treatment, without the preliminary inspection.
the.beatles.songs.num <- fixOutliers(the.beatles.songs.num, "Covered.by")
boxplot(x = the.beatles.songs.num[["Covered.by"]], xlab = "Covered.by")
# no more outliers in Covered.by
Demonstrate an attempt to fix outliers with highly skewed data:
# Top.50.Billboard is so skewed that all five boxplot statistics are 0, hence
# every non-zero value is reported as an outlier.
boxplot(x = the.beatles.songs.num[["Top.50.Billboard"]], xlab = "Top.50.Billboard")
boxplot.stats(x = the.beatles.songs.num[["Top.50.Billboard"]])
## $stats
## [1] 0 0 0 0 0
##
## $n
## [1] 310
##
## $conf
## [1] 0 0
##
## $out
## [1] 43 1 10 36 14 3 41 45 22 25 30 13 12 47 28 44 37 50 2 40 15 49 8
## [24] 29 46 35 11 6 24 17 32 27 33 9 4 19 48 20 7 21 18 5 23 31 34 38
## [47] 42 26 39
boxplot.stats(x = the.beatles.songs.num[["Top.50.Billboard"]])[["out"]]
## [1] 43 1 10 36 14 3 41 45 22 25 30 13 12 47 28 44 37 50 2 40 15 49 8
## [24] 29 46 35 11 6 24 17 32 27 33 9 4 19 48 20 7 21 18 5 23 31 34 38
## [47] 42 26 39
boxplot.stats(x = the.beatles.songs.num[["Top.50.Billboard"]])[["stats"]][c(1, 5)]
## [1] 0 0
# Keep a copy of the column: the fix below is only a demonstration and is
# undone right after the plot.
temp <- the.beatles.songs.num[["Top.50.Billboard"]]
the.beatles.songs.num <- fixOutliers(the.beatles.songs.num, "Top.50.Billboard")
boxplot(x = the.beatles.songs.num[["Top.50.Billboard"]], xlab = "Top.50.Billboard")
# Put the original values back.
the.beatles.songs.num[["Top.50.Billboard"]] <- temp
Summarize the results so far:
# Per-column summary after the outlier treatment (Top.50.Billboard restored).
summary(object = the.beatles.songs.num)
## Duration Other.releases Covered.by Top.50.Billboard
## Min. : 91.72 Min. : 0.00 Min. : 0.000 Min. : 0.000
## 1st Qu.:133.00 1st Qu.: 0.00 1st Qu.: 0.000 1st Qu.: 0.000
## Median :150.00 Median : 9.00 Median : 2.000 Median : 0.000
## Mean :155.84 Mean :10.07 Mean : 5.323 Mean : 4.061
## 3rd Qu.:172.75 3rd Qu.:16.00 3rd Qu.: 8.000 3rd Qu.: 0.000
## Max. :234.47 Max. :30.55 Max. :20.000 Max. :50.000
See if there are some patterns in the data, pairwise, to possibly indicate clusters:
pairs(~ <column 1 name> + <column 2 name> + ...,
+ data = <dataframe>)
# Scatterplot matrix of all four features: no striking pattern, i.e. no visual
# indication of clusters.
pairs(~ Duration + Other.releases + Covered.by + Top.50.Billboard,
      data = the.beatles.songs.num)
Try K-Means with 2 variables.
Plot the data first:
<scatterplot> <-
+ ggplot(<dataset>, aes(x = <num.var.1>, y = <num.var.2>)) +
+ geom_point(shape = <n>, # <n> = 1: hollow circle, no fill;
+ # <n> = 21: circle that can be filled
+ fill = <color 1>, # color of point fill (optional)
+ color = <color 2>, # color of point line (optional)
+ size = <s>) # size of point line (optional)
<scatterplot> <- <scatterplot> + xlab("<x label>") # label/caption on x-axis
<scatterplot> <- <scatterplot> + ylab("<y label>") # label/caption on y-axis
<scatterplot> <- <scatterplot> + ggtitle("<scatterplot title>") # scatterplot title
<scatterplot> # plot it
Alternatively:
<scatterplot> <-
+ ggplot(<dataset>, aes(x = <num.var.1>, y = <num.var.2>)) +
+ geom_point(shape = <n>, # <n> = 1: hollow circle, no fill;
+ # <n> = 21: circle that can be filled
+ fill = <color 1>, # color of point fill (optional)
+ color = <color 2>, # color of point line (optional)
+ size = <s>) # size of point line (optional)
<scatterplot> <- <scatterplot> +
+ labs(x = "<x label>", # label/caption on x-axis
+ y = "<y label>", # label/caption on y-axis
+ title = "<scatterplot title>") # scatterplot title
<scatterplot> # plot it
# Scatterplot of the two features used for the first K-Means run: filled
# circles (shape 21) on a black-and-white theme.
scatterplot.Other.releases.vs.Covered.by <-
  ggplot(data = the.beatles.songs.num,
         mapping = aes(x = Other.releases, y = Covered.by)) +
  geom_point(shape = 21, fill = "yellow", size = 2) +
  xlab("Other.releases") +
  ylab("Covered.by") +
  ggtitle("Covered.by vs. Other.releases") +
  theme_bw()
# Typing the name at top level prints (draws) the plot.
scatterplot.Other.releases.vs.Covered.by
Subset the original data to include only the variables to be used in K-Means:
<new dataframe> <- <dataframe>[, c("<col1 name>", "<col2 name>")]
<new dataframe> <- <dataframe>[, <col1 index>:<col2 index>]
Alternatively:
<new dataframe> <- subset(<dataframe>, select = c("<col1 name>", "<col2 name>"))
<new dataframe> <- subset(<dataframe>, select = c(<col1 index>:<col2 index>))
# Keep only the two K-Means features; subset(select = ) is equivalent to the
# bracket form shown in the comment below.
the.beatles.songs.num.1 <- subset(the.beatles.songs.num,
                                  select = c("Other.releases", "Covered.by"))
# the.beatles.songs.num.1 <- the.beatles.songs.num[, c("Other.releases", "Covered.by")]
summary(object = the.beatles.songs.num.1)
## Other.releases Covered.by
## Min. : 0.00 Min. : 0.000
## 1st Qu.: 0.00 1st Qu.: 0.000
## Median : 9.00 Median : 2.000
## Mean :10.07 Mean : 5.323
## 3rd Qu.:16.00 3rd Qu.: 8.000
## Max. :30.55 Max. :20.000
# First rows; the song titles are the row names.
head(x = the.beatles.songs.num.1)
## Other.releases Covered.by
## 12-Bar Original 0.00 0
## A Day in the Life 12.00 20
## A Hard Day's Night 30.55 20
## A Shot of Rhythm and Blues 0.00 0
## A Taste of Honey 29.00 0
## Across the Universe 19.00 20
Data normalization: required by K-Means when the variables have different ranges.
range(<dataframe with numeric columns>$<numeric column 1>) # check the range of <numeric column 1>
range(<dataframe with numeric columns>$<numeric column 2>) # check the range of <numeric column 2>
...
# install.packages("clusterSim")
library(clusterSim)
<dataframe with numeric columns> <- # works with vectors and matrices as well
+ data.Normalization(<dataframe with numeric columns>,
+ type = "n4", # normalization: (x - min(x)) / (max(x) - min(x))
+ normalization = "column") # normalization by columns
# The two features span different ranges (0-30.55 vs. 0-20), so they are
# rescaled before clustering.
range(the.beatles.songs.num.1[["Other.releases"]])
## [1] 0.00 30.55
range(the.beatles.songs.num.1[["Covered.by"]])
## [1] 0 20
library(clusterSim)
## Loading required package: cluster
## Loading required package: MASS
##
## This is package 'modeest' written by P. PONCET.
## For a complete list of functions, use 'library(help = "modeest")' or 'help.start()'.
# type "n4" is min-max scaling, (x - min(x)) / (max(x) - min(x)), applied
# column by column.
the.beatles.songs.num.2 <- data.Normalization(the.beatles.songs.num.1,
                                              type = "n4",
                                              normalization = "column")
# Last rows of the rescaled data.
tail(x = the.beatles.songs.num.2)
## Other.releases Covered.by
## You'll Be Mine 0.00000000 0.00
## You're Going to Lose That Girl 0.19639935 0.10
## You've Got to Hide Your Love Away 0.39279869 1.00
## You've Really Got a Hold on Me 0.06546645 0.00
## Young Blood 0.00000000 0.00
## Your Mother Should Know 0.42553191 0.05
Run K-Means for K = 3:
set.seed(<seed>)
<clusters> <- kmeans(x = <normalized dataframe>,
+ centers = 3, # K = 3
+ iter.max = 20, # max number of iterations allowed
+ nstart = 1000) # no. of initial configurations
+ # (report generated based on the best one)
<clusters>
# Fix the RNG seed so that the random initial configurations are reproducible.
set.seed(seed = 888)
clusters.K3 <- kmeans(
  x = the.beatles.songs.num.2,
  centers = 3,      # K = 3
  iter.max = 20,    # max number of iterations allowed
  nstart = 1000     # no. of initial configurations (the best one is reported)
)
# Print the clustering report.
clusters.K3
## K-means clustering with 3 clusters of sizes 103, 149, 58
##
## Cluster means:
## Other.releases Covered.by
## 1 0.53862044 0.2189320
## 2 0.08633662 0.0466443
## 3 0.58352616 0.9137931
##
## Clustering vector:
## 12-Bar Original
## 2
## A Day in the Life
## 3
## A Hard Day's Night
## 3
## A Shot of Rhythm and Blues
## 2
## A Taste of Honey
## 1
## Across the Universe
## 3
## Act Naturally
## 1
## Ain't She Sweet
## 2
## All I've Got to Do
## 2
## All My Loving
## 3
## All Things Must Pass
## 2
## All Together Now
## 2
## All You Need Is Love
## 3
## And I Love Her
## 3
## And Your Bird Can Sing
## 1
## Anna (Go to Him)
## 1
## Another Girl
## 2
## Any Time at All
## 1
## Ask Me Why
## 1
## Baby It's You
## 1
## Baby's in Black
## 1
## Baby, You're a Rich Man
## 1
## Back in the U.S.S.R.
## 3
## Bad Boy
## 2
## Bad to Me
## 2
## Beautiful Dreamer
## 2
## Because I Know You Love Me So
## 2
## Because
## 3
## Being for the Benefit of Mr. Kite!
## 1
## Birthday
## 1
## Blackbird
## 3
## Blue Jay Way
## 1
## Boys
## 1
## Bésame Mucho
## 2
## Can't Buy Me Love
## 3
## Carol
## 2
## Carry That Weight
## 2
## Catswalk
## 2
## Cayenne
## 2
## Chains
## 1
## Child of Nature
## 2
## Christmas Time (Is Here Again)
## 2
## Circles
## 2
## Clarabella
## 2
## Come and Get It
## 2
## Come Together
## 3
## Cry Baby Cry
## 1
## Cry for a Shadow
## 2
## Crying, Waiting, Hoping
## 2
## Day Tripper
## 3
## Dear Prudence
## 3
## Devil in Her Heart
## 2
## Dig a Pony
## 1
## Dig It
## 2
## Dizzy, Miss Lizzy
## 1
## Do You Want to Know a Secret?
## 1
## Doctor Robert
## 2
## Don't Bother Me
## 1
## Don't Ever Change
## 2
## Don't Let Me Down
## 3
## Don't Pass Me By
## 1
## Drive My Car
## 3
## Eight Days a Week
## 1
## Eleanor Rigby
## 3
## Etcetera
## 2
## Every Little Thing
## 2
## Everybody's Got Something to Hide Except Me and My Monkey
## 1
## Everybody's Trying to Be My Baby
## 1
## Fancy My Chances with You
## 2
## Fixing a Hole
## 2
## Flying
## 1
## For No One
## 1
## For You Blue
## 1
## Free as a Bird
## 2
## From Me to You
## 1
## From Us to You
## 2
## Get Back
## 3
## Getting Better
## 1
## Girl
## 1
## Glad All Over
## 2
## Glass Onion
## 2
## Golden Slumbers
## 3
## Good Day Sunshine
## 2
## Good Morning, Good Morning
## 2
## Good Night
## 1
## Goodbye
## 2
## Got to Get You into My Life
## 1
## Hallelujah, I Love Her So
## 2
## Happiness Is a Warm Gun
## 1
## Heather
## 2
## Hello Little Girl
## 2
## Hello, Goodbye
## 1
## Help!
## 3
## Helter Skelter
## 3
## Her Majesty
## 2
## Here Comes the Sun
## 3
## Here, There and Everywhere
## 3
## Hey Bulldog
## 1
## Hey Jude
## 3
## Hippy Hippy Shake
## 2
## Hold Me Tight
## 1
## Honey Don't
## 1
## Honey Pie
## 2
## How Do You Do It?
## 2
## I Am the Walrus
## 3
## I Call Your Name
## 1
## I Don't Want to Spoil the Party
## 1
## I Feel Fine
## 1
## I Forgot to Remember to Forget
## 2
## I Got a Woman
## 2
## I Got to Find My Baby
## 2
## I Just Don't Understand
## 2
## I Lost My Little Girl
## 2
## I Me Mine
## 2
## I Need You
## 1
## I Saw Her Standing There
## 3
## I Should Have Known Better
## 1
## I Wanna Be Your Man
## 1
## I Want to Hold Your Hand
## 3
## I Want to Tell You
## 2
## I Want You (She's So Heavy)
## 3
## I Will
## 3
## I'll Be Back
## 1
## I'll Be on My Way
## 2
## I'll Cry Instead
## 1
## I'll Follow the Sun
## 1
## I'll Get You
## 1
## I'll Keep You Satisfied
## 2
## I'm a Loser
## 1
## I'm Down
## 1
## I'm Gonna Sit Right Down and Cry (Over You)
## 2
## I'm Happy Just to Dance with You
## 2
## I'm In Love
## 2
## I'm Looking Through You
## 1
## I'm Only Sleeping
## 1
## I'm So Tired
## 1
## I'm Talking About You (Star Club)
## 2
## I'm Talking About You (BBC)
## 2
## I've Got a Feeling
## 1
## I've Just Seen a Face
## 3
## If I Fell
## 3
## If I Needed Someone
## 1
## If You've Got Trouble
## 2
## In My Life
## 3
## In Spite of All the Danger
## 2
## It Won't Be Long
## 1
## It's All Too Much
## 1
## It's Only Love
## 1
## Jazz Piano Song
## 2
## Jessie's Dream
## 2
## Johnny B. Goode
## 2
## Julia
## 1
## Junk
## 2
## Kansas City/Hey, Hey, Hey, Hey
## 2
## Keep Your Hands Off My Baby
## 2
## Komm Gib Mir Deine Hand
## 2
## Lady Madonna
## 3
## Leave My Kitten Alone
## 2
## Lend Me Your Comb
## 2
## Let It Be
## 3
## Like Dreamers Do
## 2
## Little Child
## 1
## Lonesome Tears in My Eyes
## 2
## Long Tall Sally
## 1
## Long, Long, Long
## 2
## Looking Glass
## 2
## Love Me Do
## 1
## Love of the Loved
## 2
## Love You To
## 2
## Lovely Rita
## 2
## Lucille
## 2
## Lucy in the Sky with Diamonds
## 3
## Madman
## 2
## Maggie Mae
## 2
## Magical Mystery Tour
## 1
## Mailman, Bring Me No More Blues
## 2
## Martha My Dear
## 2
## Matchbox
## 1
## Maxwell's Silver Hammer
## 2
## Mean Mr. Mustard
## 2
## Memphis, Tennessee
## 2
## Michelle
## 3
## Misery
## 1
## Money (That's What I Want)
## 1
## Moonlight Bay
## 2
## Mother Nature's Son
## 1
## Mr. Moonlight
## 1
## My Bonnie
## 1
## No Reply
## 1
## Norwegian Wood (This Bird Has Flown)
## 3
## Not a Second Time
## 2
## Not Guilty
## 2
## Nothin' Shakin' (But the Leaves on the Trees)
## 2
## Nowhere Man
## 3
## Ob-La-Di, Ob-La-Da
## 3
## Octopus's Garden
## 1
## Oh! Darling
## 3
## Old Brown Shoe
## 1
## One After 909
## 1
## One and One Is Two
## 2
## Only a Northern Song
## 2
## Ooh! My Soul
## 2
## P.S. I Love You
## 1
## Paperback Writer
## 1
## Penny Lane
## 3
## Piggies
## 2
## Please Mr. Postman
## 1
## Please Please Me
## 1
## Polythene Pam
## 2
## Rain
## 3
## Real Love
## 2
## Revolution 1
## 2
## Revolution 9
## 2
## Revolution
## 3
## Rip It Up/Shake, Rattle, and Roll/Blue Suede Shoes
## 2
## Rock and Roll Music
## 1
## Rocky Raccoon
## 1
## Roll Over Beethoven
## 1
## Run for Your Life
## 1
## Savoy Truffle
## 2
## Searchin'
## 2
## September in the Rain
## 2
## Sexy Sadie
## 1
## Sgt. Pepper's Lonely Hearts Club Band (Reprise)
## 2
## Sgt. Pepper's Lonely Hearts Club Band
## 3
## Shakin' in the Sixties
## 2
## She Came in Through the Bathroom Window
## 2
## She Loves You
## 3
## She Said She Said
## 2
## She's a Woman
## 1
## She's Leaving Home
## 3
## Shout
## 2
## Sie Liebt Dich
## 2
## Slow Down
## 1
## So How Come (No One Loves Me)
## 2
## Soldier of Love (Lay Down Your Arms)
## 2
## Some Other Guy
## 2
## Something
## 3
## Sour Milk Sea
## 2
## Step Inside Love/Los Paranoias
## 2
## Strawberry Fields Forever
## 3
## Sun King
## 2
## Sure to Fall (In Love with You)
## 2
## Sweet Little Sixteen
## 2
## Take Good Care of My Baby
## 2
## Taking a Trip to Carolina
## 2
## Taxman
## 3
## Teddy Boy
## 2
## Tell Me What You See
## 2
## Tell Me Why
## 1
## Thank You Girl
## 1
## That Means a Lot
## 2
## That'll Be the Day
## 2
## That's All Right (Mama)
## 2
## The Ballad of John and Yoko
## 1
## The Continuing Story of Bungalow Bill
## 2
## The End
## 2
## The Fool on the Hill
## 3
## The Honeymoon Song
## 2
## The Inner Light
## 2
## The Long and Winding Road
## 3
## The Night Before
## 1
## The Saints
## 2
## The Sheik of Araby
## 2
## The Word
## 2
## There's a Place
## 1
## Things We Said Today
## 1
## Think for Yourself
## 2
## This Boy
## 1
## Three Cool Cats
## 2
## Ticket to Ride
## 3
## Till There Was You
## 1
## Tip of My Tongue
## 2
## To Know Her is to Love Her
## 2
## Tomorrow Never Knows
## 3
## Too Much Monkey Business
## 2
## Twist and Shout
## 1
## Two of Us
## 1
## Wait
## 2
## Watching Rainbows
## 2
## We Can Work It Out
## 3
## What Goes On
## 2
## What You're Doing
## 2
## What's The New Mary Jane
## 2
## When I Get Home
## 1
## When I'm Sixty-Four
## 3
## While My Guitar Gently Weeps
## 3
## Why Don't We Do It in the Road?
## 1
## Wild Honey Pie
## 2
## Winston's Walk
## 2
## With a Little Help from My Friends
## 3
## Within You Without You
## 1
## Woman
## 2
## Words of Love
## 1
## Yellow Submarine
## 1
## Yer Blues
## 1
## Yes It Is
## 1
## Yesterday
## 3
## You Can't Do That
## 1
## You Know My Name (Look Up the Number)
## 2
## You Know What to Do
## 2
## You Like Me Too Much
## 1
## You Never Give Me Your Money
## 2
## You Won't See Me
## 1
## You'll Be Mine
## 2
## You're Going to Lose That Girl
## 2
## You've Got to Hide Your Love Away
## 3
## You've Really Got a Hold on Me
## 2
## Young Blood
## 2
## Your Mother Should Know
## 1
##
## Within cluster sum of squares by cluster:
## [1] 8.245325 3.317424 5.057079
## (between_SS / total_SS = 74.6 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
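The meaning of parameters in the report:
cluster - vector of integers (from 1 to K) indicating the cluster to which each observation is allocated
centers - matrix of cluster centers (means)
totss - the total sum of squares
withinss - vector of within-cluster sums of squares, one component per cluster
tot.withinss - total within-cluster sum of squares, i.e. sum(withinss)
betweenss - the between-cluster sum of squares, i.e. totss - tot.withinss
size - the number of observations in each cluster
iter - the number of (outer) iterations
ifault - integer indicator of a possible algorithm problem (0 means no problem)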
Add the vector of clusters to the dataframe:
<normalized dataframe>$<new column> <- factor(<clusters>$cluster) # <clusters>: from the previous step
head(<normalized dataframe>)
# Attach the K = 3 cluster labels to the normalized data as a factor column.
the.beatles.songs.num.2[["Cluster"]] <- factor(clusters.K3[["cluster"]])
head(x = the.beatles.songs.num.2)
## Other.releases Covered.by Cluster
## 12-Bar Original 0.0000000 0 2
## A Day in the Life 0.3927987 1 3
## A Hard Day's Night 1.0000000 1 3
## A Shot of Rhythm and Blues 0.0000000 0 2
## A Taste of Honey 0.9492635 0 1
## Across the Universe 0.6219313 1 3
Plot the clusters in a new scatterplot, using plotClusters() utility function:
source("Plot clusters.R")
plotClusters <- function(dataset, # dataset with the cluster column
+ xcol, # dataset column for the x-axis, passed as a string
+ ycol, # dataset column for the y-axis, passed as a string
+ clustercol, # dataset column showing the clusters, passed as a string
+ title, # plot title
+ x.label, # x-axis label
+ y.label, # y-axis label
+ legend.label, # plot legend label
+ show.centers.flag, # plot cluster centers if TRUE
+ clusters) { # clusters computed by kmeans() in a previous step
# (Re)load the plotClusters() helper.
source("Plot clusters.R")
# Scatterplot of the K = 3 clustering. Arguments are positional, in the order
# given by the plotClusters() signature synopsis earlier in this file.
plotClusters(the.beatles.songs.num.2, # dataset with the cluster column
"Other.releases", # x-axis column
"Covered.by", # y-axis column
"Cluster", # column holding the cluster labels
"Clusters: (Other.releases, Covered.by), normalized", # plot title
"Other.releases", # x-axis label
"Covered.by", # y-axis label
"Cluster", # legend label
TRUE, # also plot the cluster centers
clusters.K3) # the kmeans() result for K = 3
Find the optimal value for K, using the Elbow method (a call to the appropriate utility function); an appropriate dataframe should be passed as the parameter (just numeric variables, no clusters):
source("Elbow method.R")
<elbow parameters> <- getElbowMethodParameters(<dataframe>[, c(<n1>, <n2>, ...)]) # leave out the cluster column
<elbow parameters>
plotElbow(<elbow parameters>)
# Load the Elbow-method helpers.
source(file = "Elbow method.R")
# Pass only the two numeric features, selected by name so that the Cluster
# column is left out.
elbow.2 <- getElbowMethodParameters(
  the.beatles.songs.num.2[, c("Other.releases", "Covered.by")]
)
elbow.2
## cluster tot.withinss between_SS / total_SS
## 1 2 27.784582 0.5752468
## 2 3 16.619828 0.7459265
## 3 4 11.367590 0.8262194
## 4 5 8.106076 0.8760794
## 5 6 6.345421 0.9029952
## 6 7 5.040072 0.9229506
## 7 8 4.171462 0.9362293
# Draw the elbow curve from the table above.
plotElbow(elbow.2)
Show differences in tot.withinss for different values of K more precisely, using the getDifferences() utility function:
source("Elbow method.R")
<diff dataframe> <-
+ data.frame(K = <n1>:<n2>,
+ diff.tot.withinss =
+ getDifferences(<elbow stats>$<tot.withinss>), # from the previous step
+ diff.ratio =
+ getDifferences(<elbow stats>$<ratio between_SS / total_SS>)) # from the previous step
names(<diff dataframe>) <- c("K", "Difference in tot.withinss", "Difference in between_SS / total_SS")
<diff dataframe>
# Change in tot.withinss (column 2 of elbow.2) and in the between_SS / total_SS
# ratio (column 3) from one K to the next, for K = 2..8; the column names are
# attached while the data frame is built.
df.differences <- setNames(
  data.frame(2:8,
             getDifferences(elbow.2[, 2]),
             getDifferences(elbow.2[, 3])),
  c("K", "Difference in tot.withinss", "Difference in between_SS / total_SS")
)
df.differences
## K Difference in tot.withinss Difference in between_SS / total_SS
## 1 2 NA NA
## 2 3 11.1647543 0.17067974
## 3 4 5.2522373 0.08029290
## 4 5 3.2615145 0.04985998
## 5 6 1.7606546 0.02691578
## 6 7 1.3053496 0.01995536
## 7 8 0.8686101 0.01327877
Run K-Means also for K = 4:
# Drop the K = 3 cluster labels before re-clustering so that kmeans() sees only
# the two numeric features. The column is removed by name rather than by
# position (-3): this keeps working if the columns are reordered and is a no-op
# when the column is already gone (e.g. when this step is re-run).
the.beatles.songs.num.2$Cluster <- NULL
# Seed right before kmeans() so that the random starts are reproducible.
set.seed(818)
clusters.K4 <- kmeans(x = the.beatles.songs.num.2,
                      centers = 4,     # K = 4
                      iter.max = 20,   # max number of iterations allowed
                      nstart = 1000)   # no. of initial configurations
# Print the clustering report.
clusters.K4
## K-means clustering with 4 clusters of sizes 58, 98, 41, 113
##
## Cluster means:
## Other.releases Covered.by
## 1 0.588605452 0.912931034
## 2 0.003340125 0.005102041
## 3 0.767115085 0.186585366
## 4 0.320959402 0.189380531
##
## Clustering vector:
## 12-Bar Original
## 2
## A Day in the Life
## 1
## A Hard Day's Night
## 1
## A Shot of Rhythm and Blues
## 2
## A Taste of Honey
## 3
## Across the Universe
## 1
## Act Naturally
## 4
## Ain't She Sweet
## 4
## All I've Got to Do
## 4
## All My Loving
## 1
## All Things Must Pass
## 2
## All Together Now
## 4
## All You Need Is Love
## 1
## And I Love Her
## 1
## And Your Bird Can Sing
## 4
## Anna (Go to Him)
## 4
## Another Girl
## 4
## Any Time at All
## 4
## Ask Me Why
## 3
## Baby It's You
## 3
## Baby's in Black
## 3
## Baby, You're a Rich Man
## 4
## Back in the U.S.S.R.
## 1
## Bad Boy
## 2
## Bad to Me
## 2
## Beautiful Dreamer
## 2
## Because I Know You Love Me So
## 2
## Because
## 1
## Being for the Benefit of Mr. Kite!
## 4
## Birthday
## 4
## Blackbird
## 1
## Blue Jay Way
## 4
## Boys
## 3
## Bésame Mucho
## 2
## Can't Buy Me Love
## 1
## Carol
## 2
## Carry That Weight
## 4
## Catswalk
## 2
## Cayenne
## 2
## Chains
## 3
## Child of Nature
## 2
## Christmas Time (Is Here Again)
## 2
## Circles
## 2
## Clarabella
## 2
## Come and Get It
## 2
## Come Together
## 1
## Cry Baby Cry
## 4
## Cry for a Shadow
## 2
## Crying, Waiting, Hoping
## 2
## Day Tripper
## 1
## Dear Prudence
## 1
## Devil in Her Heart
## 4
## Dig a Pony
## 4
## Dig It
## 4
## Dizzy, Miss Lizzy
## 3
## Do You Want to Know a Secret?
## 3
## Doctor Robert
## 4
## Don't Bother Me
## 4
## Don't Ever Change
## 2
## Don't Let Me Down
## 1
## Don't Pass Me By
## 4
## Drive My Car
## 1
## Eight Days a Week
## 3
## Eleanor Rigby
## 1
## Etcetera
## 2
## Every Little Thing
## 4
## Everybody's Got Something to Hide Except Me and My Monkey
## 4
## Everybody's Trying to Be My Baby
## 3
## Fancy My Chances with You
## 2
## Fixing a Hole
## 4
## Flying
## 4
## For No One
## 4
## For You Blue
## 4
## Free as a Bird
## 2
## From Me to You
## 3
## From Us to You
## 2
## Get Back
## 1
## Getting Better
## 4
## Girl
## 1
## Glad All Over
## 2
## Glass Onion
## 4
## Golden Slumbers
## 1
## Good Day Sunshine
## 4
## Good Morning, Good Morning
## 4
## Good Night
## 4
## Goodbye
## 2
## Got to Get You into My Life
## 4
## Hallelujah, I Love Her So
## 2
## Happiness Is a Warm Gun
## 4
## Heather
## 2
## Hello Little Girl
## 2
## Hello, Goodbye
## 4
## Help!
## 1
## Helter Skelter
## 1
## Her Majesty
## 4
## Here Comes the Sun
## 1
## Here, There and Everywhere
## 1
## Hey Bulldog
## 4
## Hey Jude
## 1
## Hippy Hippy Shake
## 2
## Hold Me Tight
## 4
## Honey Don't
## 4
## Honey Pie
## 4
## How Do You Do It?
## 2
## I Am the Walrus
## 1
## I Call Your Name
## 4
## I Don't Want to Spoil the Party
## 4
## I Feel Fine
## 3
## I Forgot to Remember to Forget
## 2
## I Got a Woman
## 2
## I Got to Find My Baby
## 2
## I Just Don't Understand
## 2
## I Lost My Little Girl
## 2
## I Me Mine
## 4
## I Need You
## 4
## I Saw Her Standing There
## 1
## I Should Have Known Better
## 3
## I Wanna Be Your Man
## 3
## I Want to Hold Your Hand
## 1
## I Want to Tell You
## 4
## I Want You (She's So Heavy)
## 1
## I Will
## 4
## I'll Be Back
## 4
## I'll Be on My Way
## 2
## I'll Cry Instead
## 4
## I'll Follow the Sun
## 3
## I'll Get You
## 4
## I'll Keep You Satisfied
## 2
## I'm a Loser
## 3
## I'm Down
## 3
## I'm Gonna Sit Right Down and Cry (Over You)
## 2
## I'm Happy Just to Dance with You
## 4
## I'm In Love
## 2
## I'm Looking Through You
## 4
## I'm Only Sleeping
## 4
## I'm So Tired
## 4
## I'm Talking About You (Star Club)
## 2
## I'm Talking About You (BBC)
## 2
## I've Got a Feeling
## 4
## I've Just Seen a Face
## 1
## If I Fell
## 1
## If I Needed Someone
## 4
## If You've Got Trouble
## 2
## In My Life
## 1
## In Spite of All the Danger
## 2
## It Won't Be Long
## 4
## It's All Too Much
## 4
## It's Only Love
## 4
## Jazz Piano Song
## 2
## Jessie's Dream
## 2
## Johnny B. Goode
## 2
## Julia
## 4
## Junk
## 2
## Kansas City/Hey, Hey, Hey, Hey
## 4
## Keep Your Hands Off My Baby
## 2
## Komm Gib Mir Deine Hand
## 4
## Lady Madonna
## 1
## Leave My Kitten Alone
## 2
## Lend Me Your Comb
## 2
## Let It Be
## 1
## Like Dreamers Do
## 2
## Little Child
## 4
## Lonesome Tears in My Eyes
## 2
## Long Tall Sally
## 3
## Long, Long, Long
## 4
## Looking Glass
## 2
## Love Me Do
## 3
## Love of the Loved
## 2
## Love You To
## 4
## Lovely Rita
## 4
## Lucille
## 2
## Lucy in the Sky with Diamonds
## 1
## Madman
## 2
## Maggie Mae
## 4
## Magical Mystery Tour
## 3
## Mailman, Bring Me No More Blues
## 2
## Martha My Dear
## 4
## Matchbox
## 3
## Maxwell's Silver Hammer
## 4
## Mean Mr. Mustard
## 4
## Memphis, Tennessee
## 2
## Michelle
## 1
## Misery
## 3
## Money (That's What I Want)
## 3
## Moonlight Bay
## 2
## Mother Nature's Son
## 4
## Mr. Moonlight
## 3
## My Bonnie
## 4
## No Reply
## 4
## Norwegian Wood (This Bird Has Flown)
## 1
## Not a Second Time
## 4
## Not Guilty
## 2
## Nothin' Shakin' (But the Leaves on the Trees)
## 2
## Nowhere Man
## 1
## Ob-La-Di, Ob-La-Da
## 1
## Octopus's Garden
## 4
## Oh! Darling
## 1
## Old Brown Shoe
## 4
## One After 909
## 3
## One and One Is Two
## 2
## Only a Northern Song
## 4
## Ooh! My Soul
## 2
## P.S. I Love You
## 3
## Paperback Writer
## 3
## Penny Lane
## 1
## Piggies
## 4
## Please Mr. Postman
## 3
## Please Please Me
## 3
## Polythene Pam
## 4
## Rain
## 1
## Real Love
## 2
## Revolution 1
## 4
## Revolution 9
## 4
## Revolution
## 1
## Rip It Up/Shake, Rattle, and Roll/Blue Suede Shoes
## 2
## Rock and Roll Music
## 3
## Rocky Raccoon
## 4
## Roll Over Beethoven
## 3
## Run for Your Life
## 4
## Savoy Truffle
## 4
## Searchin'
## 2
## September in the Rain
## 2
## Sexy Sadie
## 4
## Sgt. Pepper's Lonely Hearts Club Band (Reprise)
## 4
## Sgt. Pepper's Lonely Hearts Club Band
## 1
## Shakin' in the Sixties
## 2
## She Came in Through the Bathroom Window
## 4
## She Loves You
## 1
## She Said She Said
## 4
## She's a Woman
## 3
## She's Leaving Home
## 1
## Shout
## 2
## Sie Liebt Dich
## 4
## Slow Down
## 3
## So How Come (No One Loves Me)
## 2
## Soldier of Love (Lay Down Your Arms)
## 2
## Some Other Guy
## 2
## Something
## 1
## Sour Milk Sea
## 2
## Step Inside Love/Los Paranoias
## 2
## Strawberry Fields Forever
## 1
## Sun King
## 4
## Sure to Fall (In Love with You)
## 2
## Sweet Little Sixteen
## 2
## Take Good Care of My Baby
## 2
## Taking a Trip to Carolina
## 2
## Taxman
## 1
## Teddy Boy
## 2
## Tell Me What You See
## 4
## Tell Me Why
## 4
## Thank You Girl
## 3
## That Means a Lot
## 2
## That'll Be the Day
## 2
## That's All Right (Mama)
## 2
## The Ballad of John and Yoko
## 4
## The Continuing Story of Bungalow Bill
## 4
## The End
## 4
## The Fool on the Hill
## 1
## The Honeymoon Song
## 2
## The Inner Light
## 4
## The Long and Winding Road
## 1
## The Night Before
## 4
## The Saints
## 2
## The Sheik of Araby
## 2
## The Word
## 4
## There's a Place
## 3
## Things We Said Today
## 3
## Think for Yourself
## 4
## This Boy
## 3
## Three Cool Cats
## 2
## Ticket to Ride
## 1
## Till There Was You
## 3
## Tip of My Tongue
## 2
## To Know Her is to Love Her
## 2
## Tomorrow Never Knows
## 1
## Too Much Monkey Business
## 2
## Twist and Shout
## 3
## Two of Us
## 4
## Wait
## 4
## Watching Rainbows
## 2
## We Can Work It Out
## 1
## What Goes On
## 4
## What You're Doing
## 4
## What's The New Mary Jane
## 2
## When I Get Home
## 4
## When I'm Sixty-Four
## 1
## While My Guitar Gently Weeps
## 1
## Why Don't We Do It in the Road?
## 4
## Wild Honey Pie
## 4
## Winston's Walk
## 2
## With a Little Help from My Friends
## 1
## Within You Without You
## 4
## Woman
## 2
## Words of Love
## 4
## Yellow Submarine
## 3
## Yer Blues
## 4
## Yes It Is
## 4
## Yesterday
## 1
## You Can't Do That
## 3
## You Know My Name (Look Up the Number)
## 2
## You Know What to Do
## 2
## You Like Me Too Much
## 4
## You Never Give Me Your Money
## 4
## You Won't See Me
## 4
## You'll Be Mine
## 2
## You're Going to Lose That Girl
## 4
## You've Got to Hide Your Love Away
## 1
## You've Really Got a Hold on Me
## 2
## Young Blood
## 2
## Your Mother Should Know
## 4
##
## Within cluster sum of squares by cluster:
## [1] 4.9866854 0.1313566 2.3811301 3.8684182
## (between_SS / total_SS = 82.6 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
# Attach the K = 4 cluster labels to the normalized data as a factor column.
the.beatles.songs.num.2[["Cluster"]] <- factor(clusters.K4[["cluster"]])
head(x = the.beatles.songs.num.2)
## Other.releases Covered.by Cluster
## 12-Bar Original 0.0000000 0 2
## A Day in the Life 0.3927987 1 1
## A Hard Day's Night 1.0000000 1 1
## A Shot of Rhythm and Blues 0.0000000 0 2
## A Taste of Honey 0.9492635 0 3
## Across the Universe 0.6219313 1 1
# (Re)load the plotClusters() helper.
source("Plot clusters.R")
# Scatterplot of the K = 4 clustering. Arguments are positional, in the order
# given by the plotClusters() signature synopsis earlier in this file.
plotClusters(the.beatles.songs.num.2, # dataset with the cluster column
"Other.releases", # x-axis column
"Covered.by", # y-axis column
"Cluster", # column holding the cluster labels
"Clusters: (Other.releases, Covered.by), normalized", # plot title
"Other.releases", # x-axis label
"Covered.by", # y-axis label
"Cluster", # legend label
TRUE, # also plot the cluster centers
clusters.K4) # the kmeans() result for K = 4
Examine clusters more closely by looking into the cluster centers (means) and standard deviations from the centers. In doing so, use ‘regular’ (not normalized) features and the summarizeClusterStats() utility function:
source("Summary statistics about clusters.R")
<stats dataframe> <-
+ summarizeClusterStats(feature.set = <dataframe with 'regular' (not normalized) features>,
+ clusters = <clusters>$cluster, # <clusters>: result of kmeans()
+ cl.num = <K>) # <K>: number of clusters
Compare dispersion (sd) in this representation of the data and in the corresponding plots.
# Load summarizeClusterStats().
source(file = "Summary statistics about clusters.R")
# Per-cluster statistics on the original (not normalized) two-feature data,
# once for the K = 3 and once for the K = 4 clustering. The results are only
# stored here; print the objects to inspect them.
clusters.K3.stats <- summarizeClusterStats(
  feature.set = the.beatles.songs.num.1,
  clusters = clusters.K3$cluster,
  cl.num = 3
)
clusters.K4.stats <- summarizeClusterStats(
  feature.set = the.beatles.songs.num.1,
  clusters = clusters.K4$cluster,
  cl.num = 4
)
Data normalization:
library(clusterSim)
# Min-max rescale (type "n4") all four features, column by column.
the.beatles.songs.num.4 <- data.Normalization(the.beatles.songs.num,
                                              type = "n4",
                                              normalization = "column")
# Last rows of the rescaled data.
tail(x = the.beatles.songs.num.4)
## Duration Other.releases Covered.by
## You'll Be Mine 0.04395797 0.00000000 0.00
## You're Going to Lose That Girl 0.33817863 0.19639935 0.10
## You've Got to Hide Your Love Away 0.27513135 0.39279869 1.00
## You've Really Got a Hold on Me 0.63239930 0.06546645 0.00
## Young Blood 0.17005254 0.00000000 0.00
## Your Mother Should Know 0.40122592 0.42553191 0.05
## Top.50.Billboard
## You'll Be Mine 0
## You're Going to Lose That Girl 0
## You've Got to Hide Your Love Away 0
## You've Really Got a Hold on Me 0
## Young Blood 0
## Your Mother Should Know 0
Find the optimal value for K, using the Elbow method (a call to the appropriate utility function):
# Load the Elbow-method utilities, compute the elbow metrics for the
# normalized 4-feature dataset and print them (one row per candidate K, with
# tot.withinss and the between_SS / total_SS ratio).
source("Elbow method.R")
elbow.4 <- getElbowMethodParameters(the.beatles.songs.num.4)
elbow.4
## cluster tot.withinss between_SS / total_SS
## 1 2 57.11964 0.4301936
## 2 3 45.92685 0.5418492
## 3 4 35.76960 0.6431745
## 4 5 27.82075 0.7224695
## 5 6 24.28521 0.7577389
## 6 7 21.46786 0.7858438
## 7 8 19.09187 0.8095459
# Visualize the metrics stored in elbow.4 with the plotElbow() utility.
plotElbow(elbow.4)
Show differences in tot.withinss for different values of K more precisely, using the getDifferences() utility function:
# For every candidate K, tabulate the change in each elbow metric between
# consecutive values of K (the first row has no predecessor, hence NA).
df.differences <- setNames(
  data.frame(
    K = 2:8,
    getDifferences(elbow.4[, 2]),  # column 2: tot.withinss
    getDifferences(elbow.4[, 3])   # column 3: between_SS / total_SS
  ),
  c(
    "K",
    "Difference in tot.withinss",
    "Difference in between_SS / total_SS"
  )
)
df.differences
## K Difference in tot.withinss Difference in between_SS / total_SS
## 1 2 NA NA
## 2 3 11.192793 0.11165555
## 3 4 10.157254 0.10132536
## 4 5 7.948846 0.07929502
## 5 6 3.535538 0.03526934
## 6 7 2.817348 0.02810492
## 7 8 2.375991 0.02370209
Run K-Means for K = 3, since K = 3 seems to be the best value for K:
# Fix the RNG seed so the random starts (and thus the result) are reproducible.
set.seed(888)

# K-Means with 3 clusters on all four normalized features: at most 20
# iterations per run, best result out of 1000 random starts.
clusters.K3.all.vars <- kmeans(
  x = the.beatles.songs.num.4,
  centers = 3,
  iter.max = 20,
  nstart = 1000
)
clusters.K3.all.vars
## K-means clustering with 3 clusters of sizes 112, 135, 63
##
## Cluster means:
## Duration Other.releases Covered.by Top.50.Billboard
## 1 0.4690799 0.48754968 0.20446429 0.035000000
## 2 0.3746786 0.06789113 0.03518519 0.003851852
## 3 0.5731271 0.60977321 0.87063492 0.329206349
##
## Clustering vector:
## 12-Bar Original
## 2
## A Day in the Life
## 3
## A Hard Day's Night
## 3
## A Shot of Rhythm and Blues
## 2
## A Taste of Honey
## 1
## Across the Universe
## 3
## Act Naturally
## 1
## Ain't She Sweet
## 2
## All I've Got to Do
## 2
## All My Loving
## 3
## All Things Must Pass
## 2
## All Together Now
## 2
## All You Need Is Love
## 3
## And I Love Her
## 3
## And Your Bird Can Sing
## 1
## Anna (Go to Him)
## 1
## Another Girl
## 2
## Any Time at All
## 1
## Ask Me Why
## 1
## Baby It's You
## 1
## Baby's in Black
## 1
## Baby, You're a Rich Man
## 1
## Back in the U.S.S.R.
## 3
## Bad Boy
## 2
## Bad to Me
## 2
## Beautiful Dreamer
## 2
## Because I Know You Love Me So
## 2
## Because
## 3
## Being for the Benefit of Mr. Kite!
## 1
## Birthday
## 1
## Blackbird
## 3
## Blue Jay Way
## 1
## Boys
## 1
## Bésame Mucho
## 2
## Can't Buy Me Love
## 3
## Carol
## 2
## Carry That Weight
## 2
## Catswalk
## 2
## Cayenne
## 2
## Chains
## 1
## Child of Nature
## 2
## Christmas Time (Is Here Again)
## 2
## Circles
## 2
## Clarabella
## 2
## Come and Get It
## 2
## Come Together
## 3
## Cry Baby Cry
## 1
## Cry for a Shadow
## 2
## Crying, Waiting, Hoping
## 2
## Day Tripper
## 3
## Dear Prudence
## 3
## Devil in Her Heart
## 1
## Dig a Pony
## 1
## Dig It
## 2
## Dizzy, Miss Lizzy
## 1
## Do You Want to Know a Secret?
## 1
## Doctor Robert
## 2
## Don't Bother Me
## 1
## Don't Ever Change
## 2
## Don't Let Me Down
## 3
## Don't Pass Me By
## 1
## Drive My Car
## 1
## Eight Days a Week
## 1
## Eleanor Rigby
## 3
## Etcetera
## 2
## Every Little Thing
## 2
## Everybody's Got Something to Hide Except Me and My Monkey
## 1
## Everybody's Trying to Be My Baby
## 1
## Fancy My Chances with You
## 2
## Fixing a Hole
## 2
## Flying
## 1
## For No One
## 1
## For You Blue
## 1
## Free as a Bird
## 2
## From Me to You
## 1
## From Us to You
## 2
## Get Back
## 3
## Getting Better
## 1
## Girl
## 1
## Glad All Over
## 2
## Glass Onion
## 2
## Golden Slumbers
## 3
## Good Day Sunshine
## 2
## Good Morning, Good Morning
## 1
## Good Night
## 1
## Goodbye
## 2
## Got to Get You into My Life
## 3
## Hallelujah, I Love Her So
## 2
## Happiness Is a Warm Gun
## 1
## Heather
## 2
## Hello Little Girl
## 2
## Hello, Goodbye
## 3
## Help!
## 3
## Helter Skelter
## 3
## Her Majesty
## 2
## Here Comes the Sun
## 3
## Here, There and Everywhere
## 3
## Hey Bulldog
## 1
## Hey Jude
## 3
## Hippy Hippy Shake
## 2
## Hold Me Tight
## 1
## Honey Don't
## 1
## Honey Pie
## 1
## How Do You Do It?
## 2
## I Am the Walrus
## 3
## I Call Your Name
## 1
## I Don't Want to Spoil the Party
## 1
## I Feel Fine
## 3
## I Forgot to Remember to Forget
## 2
## I Got a Woman
## 2
## I Got to Find My Baby
## 2
## I Just Don't Understand
## 2
## I Lost My Little Girl
## 2
## I Me Mine
## 1
## I Need You
## 1
## I Saw Her Standing There
## 3
## I Should Have Known Better
## 1
## I Wanna Be Your Man
## 1
## I Want to Hold Your Hand
## 3
## I Want to Tell You
## 2
## I Want You (She's So Heavy)
## 3
## I Will
## 1
## I'll Be Back
## 1
## I'll Be on My Way
## 2
## I'll Cry Instead
## 1
## I'll Follow the Sun
## 1
## I'll Get You
## 1
## I'll Keep You Satisfied
## 2
## I'm a Loser
## 1
## I'm Down
## 1
## I'm Gonna Sit Right Down and Cry (Over You)
## 2
## I'm Happy Just to Dance with You
## 2
## I'm In Love
## 2
## I'm Looking Through You
## 1
## I'm Only Sleeping
## 1
## I'm So Tired
## 1
## I'm Talking About You (Star Club)
## 2
## I'm Talking About You (BBC)
## 2
## I've Got a Feeling
## 1
## I've Just Seen a Face
## 3
## If I Fell
## 3
## If I Needed Someone
## 1
## If You've Got Trouble
## 2
## In My Life
## 3
## In Spite of All the Danger
## 2
## It Won't Be Long
## 1
## It's All Too Much
## 1
## It's Only Love
## 1
## Jazz Piano Song
## 2
## Jessie's Dream
## 2
## Johnny B. Goode
## 2
## Julia
## 1
## Junk
## 2
## Kansas City/Hey, Hey, Hey, Hey
## 2
## Keep Your Hands Off My Baby
## 2
## Komm Gib Mir Deine Hand
## 2
## Lady Madonna
## 3
## Leave My Kitten Alone
## 2
## Lend Me Your Comb
## 2
## Let It Be
## 3
## Like Dreamers Do
## 2
## Little Child
## 2
## Lonesome Tears in My Eyes
## 2
## Long Tall Sally
## 1
## Long, Long, Long
## 2
## Looking Glass
## 2
## Love Me Do
## 3
## Love of the Loved
## 2
## Love You To
## 2
## Lovely Rita
## 1
## Lucille
## 2
## Lucy in the Sky with Diamonds
## 3
## Madman
## 2
## Maggie Mae
## 2
## Magical Mystery Tour
## 1
## Mailman, Bring Me No More Blues
## 2
## Martha My Dear
## 2
## Matchbox
## 1
## Maxwell's Silver Hammer
## 1
## Mean Mr. Mustard
## 2
## Memphis, Tennessee
## 2
## Michelle
## 3
## Misery
## 1
## Money (That's What I Want)
## 1
## Moonlight Bay
## 2
## Mother Nature's Son
## 1
## Mr. Moonlight
## 1
## My Bonnie
## 1
## No Reply
## 1
## Norwegian Wood (This Bird Has Flown)
## 3
## Not a Second Time
## 2
## Not Guilty
## 2
## Nothin' Shakin' (But the Leaves on the Trees)
## 2
## Nowhere Man
## 3
## Ob-La-Di, Ob-La-Da
## 3
## Octopus's Garden
## 1
## Oh! Darling
## 3
## Old Brown Shoe
## 1
## One After 909
## 1
## One and One Is Two
## 2
## Only a Northern Song
## 1
## Ooh! My Soul
## 2
## P.S. I Love You
## 1
## Paperback Writer
## 3
## Penny Lane
## 3
## Piggies
## 2
## Please Mr. Postman
## 1
## Please Please Me
## 3
## Polythene Pam
## 2
## Rain
## 3
## Real Love
## 2
## Revolution 1
## 1
## Revolution 9
## 1
## Revolution
## 3
## Rip It Up/Shake, Rattle, and Roll/Blue Suede Shoes
## 2
## Rock and Roll Music
## 1
## Rocky Raccoon
## 1
## Roll Over Beethoven
## 1
## Run for Your Life
## 1
## Savoy Truffle
## 2
## Searchin'
## 2
## September in the Rain
## 2
## Sexy Sadie
## 1
## Sgt. Pepper's Lonely Hearts Club Band (Reprise)
## 2
## Sgt. Pepper's Lonely Hearts Club Band
## 3
## Shakin' in the Sixties
## 2
## She Came in Through the Bathroom Window
## 2
## She Loves You
## 3
## She Said She Said
## 1
## She's a Woman
## 1
## She's Leaving Home
## 3
## Shout
## 2
## Sie Liebt Dich
## 2
## Slow Down
## 1
## So How Come (No One Loves Me)
## 2
## Soldier of Love (Lay Down Your Arms)
## 2
## Some Other Guy
## 2
## Something
## 3
## Sour Milk Sea
## 2
## Step Inside Love/Los Paranoias
## 2
## Strawberry Fields Forever
## 3
## Sun King
## 2
## Sure to Fall (In Love with You)
## 2
## Sweet Little Sixteen
## 2
## Take Good Care of My Baby
## 2
## Taking a Trip to Carolina
## 2
## Taxman
## 3
## Teddy Boy
## 2
## Tell Me What You See
## 1
## Tell Me Why
## 1
## Thank You Girl
## 1
## That Means a Lot
## 2
## That'll Be the Day
## 2
## That's All Right (Mama)
## 2
## The Ballad of John and Yoko
## 1
## The Continuing Story of Bungalow Bill
## 1
## The End
## 2
## The Fool on the Hill
## 3
## The Honeymoon Song
## 2
## The Inner Light
## 1
## The Long and Winding Road
## 3
## The Night Before
## 1
## The Saints
## 2
## The Sheik of Araby
## 2
## The Word
## 2
## There's a Place
## 1
## Things We Said Today
## 1
## Think for Yourself
## 2
## This Boy
## 1
## Three Cool Cats
## 2
## Ticket to Ride
## 3
## Till There Was You
## 1
## Tip of My Tongue
## 2
## To Know Her is to Love Her
## 2
## Tomorrow Never Knows
## 3
## Too Much Monkey Business
## 2
## Twist and Shout
## 1
## Two of Us
## 1
## Wait
## 2
## Watching Rainbows
## 2
## We Can Work It Out
## 3
## What Goes On
## 1
## What You're Doing
## 2
## What's The New Mary Jane
## 2
## When I Get Home
## 1
## When I'm Sixty-Four
## 3
## While My Guitar Gently Weeps
## 3
## Why Don't We Do It in the Road?
## 1
## Wild Honey Pie
## 2
## Winston's Walk
## 2
## With a Little Help from My Friends
## 3
## Within You Without You
## 1
## Woman
## 2
## Words of Love
## 1
## Yellow Submarine
## 3
## Yer Blues
## 1
## Yes It Is
## 1
## Yesterday
## 3
## You Can't Do That
## 1
## You Know My Name (Look Up the Number)
## 2
## You Know What to Do
## 2
## You Like Me Too Much
## 1
## You Never Give Me Your Money
## 1
## You Won't See Me
## 1
## You'll Be Mine
## 2
## You're Going to Lose That Girl
## 2
## You've Got to Hide Your Love Away
## 3
## You've Really Got a Hold on Me
## 2
## Young Blood
## 2
## Your Mother Should Know
## 1
##
## Within cluster sum of squares by cluster:
## [1] 16.019939 9.727616 20.179295
## (between_SS / total_SS = 54.2 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
Run K-Means also for K = 4, since K = 4 seems to be the next best value for K:
# Same seed as for K = 3, so both runs are reproducible.
set.seed(888)

# K-Means with 4 clusters on all four normalized features: at most 20
# iterations per run, best result out of 1000 random starts.
clusters.K4.all.vars <- kmeans(
  x = the.beatles.songs.num.4,
  centers = 4,
  iter.max = 20,
  nstart = 1000
)
clusters.K4.all.vars
## K-means clustering with 4 clusters of sizes 31, 99, 46, 134
##
## Cluster means:
## Duration Other.releases Covered.by Top.50.Billboard
## 1 0.5284278 0.85169738 0.77096774 0.684516129
## 2 0.4715880 0.48592306 0.16464646 0.016767677
## 3 0.5805490 0.40742190 0.81086957 0.038695652
## 4 0.3690436 0.06668784 0.03731343 0.003880597
##
## Clustering vector:
## 12-Bar Original
## 4
## A Day in the Life
## 3
## A Hard Day's Night
## 1
## A Shot of Rhythm and Blues
## 4
## A Taste of Honey
## 2
## Across the Universe
## 3
## Act Naturally
## 2
## Ain't She Sweet
## 4
## All I've Got to Do
## 4
## All My Loving
## 3
## All Things Must Pass
## 4
## All Together Now
## 4
## All You Need Is Love
## 1
## And I Love Her
## 1
## And Your Bird Can Sing
## 2
## Anna (Go to Him)
## 2
## Another Girl
## 4
## Any Time at All
## 2
## Ask Me Why
## 2
## Baby It's You
## 2
## Baby's in Black
## 2
## Baby, You're a Rich Man
## 2
## Back in the U.S.S.R.
## 3
## Bad Boy
## 4
## Bad to Me
## 4
## Beautiful Dreamer
## 4
## Because I Know You Love Me So
## 4
## Because
## 3
## Being for the Benefit of Mr. Kite!
## 3
## Birthday
## 2
## Blackbird
## 3
## Blue Jay Way
## 2
## Boys
## 2
## Bésame Mucho
## 4
## Can't Buy Me Love
## 1
## Carol
## 4
## Carry That Weight
## 4
## Catswalk
## 4
## Cayenne
## 4
## Chains
## 2
## Child of Nature
## 4
## Christmas Time (Is Here Again)
## 4
## Circles
## 4
## Clarabella
## 4
## Come and Get It
## 4
## Come Together
## 1
## Cry Baby Cry
## 2
## Cry for a Shadow
## 4
## Crying, Waiting, Hoping
## 4
## Day Tripper
## 1
## Dear Prudence
## 3
## Devil in Her Heart
## 2
## Dig a Pony
## 2
## Dig It
## 4
## Dizzy, Miss Lizzy
## 2
## Do You Want to Know a Secret?
## 1
## Doctor Robert
## 4
## Don't Bother Me
## 2
## Don't Ever Change
## 4
## Don't Let Me Down
## 3
## Don't Pass Me By
## 2
## Drive My Car
## 3
## Eight Days a Week
## 1
## Eleanor Rigby
## 3
## Etcetera
## 4
## Every Little Thing
## 4
## Everybody's Got Something to Hide Except Me and My Monkey
## 2
## Everybody's Trying to Be My Baby
## 2
## Fancy My Chances with You
## 4
## Fixing a Hole
## 4
## Flying
## 2
## For No One
## 2
## For You Blue
## 2
## Free as a Bird
## 4
## From Me to You
## 2
## From Us to You
## 4
## Get Back
## 1
## Getting Better
## 3
## Girl
## 3
## Glad All Over
## 4
## Glass Onion
## 4
## Golden Slumbers
## 3
## Good Day Sunshine
## 4
## Good Morning, Good Morning
## 2
## Good Night
## 2
## Goodbye
## 4
## Got to Get You into My Life
## 3
## Hallelujah, I Love Her So
## 4
## Happiness Is a Warm Gun
## 3
## Heather
## 4
## Hello Little Girl
## 4
## Hello, Goodbye
## 1
## Help!
## 1
## Helter Skelter
## 3
## Her Majesty
## 4
## Here Comes the Sun
## 3
## Here, There and Everywhere
## 3
## Hey Bulldog
## 3
## Hey Jude
## 1
## Hippy Hippy Shake
## 4
## Hold Me Tight
## 2
## Honey Don't
## 2
## Honey Pie
## 2
## How Do You Do It?
## 4
## I Am the Walrus
## 3
## I Call Your Name
## 2
## I Don't Want to Spoil the Party
## 2
## I Feel Fine
## 1
## I Forgot to Remember to Forget
## 4
## I Got a Woman
## 4
## I Got to Find My Baby
## 4
## I Just Don't Understand
## 4
## I Lost My Little Girl
## 4
## I Me Mine
## 2
## I Need You
## 2
## I Saw Her Standing There
## 1
## I Should Have Known Better
## 2
## I Wanna Be Your Man
## 2
## I Want to Hold Your Hand
## 1
## I Want to Tell You
## 4
## I Want You (She's So Heavy)
## 3
## I Will
## 3
## I'll Be Back
## 2
## I'll Be on My Way
## 4
## I'll Cry Instead
## 2
## I'll Follow the Sun
## 2
## I'll Get You
## 2
## I'll Keep You Satisfied
## 4
## I'm a Loser
## 2
## I'm Down
## 2
## I'm Gonna Sit Right Down and Cry (Over You)
## 4
## I'm Happy Just to Dance with You
## 4
## I'm In Love
## 4
## I'm Looking Through You
## 2
## I'm Only Sleeping
## 2
## I'm So Tired
## 2
## I'm Talking About You (Star Club)
## 4
## I'm Talking About You (BBC)
## 4
## I've Got a Feeling
## 2
## I've Just Seen a Face
## 3
## If I Fell
## 3
## If I Needed Someone
## 2
## If You've Got Trouble
## 4
## In My Life
## 3
## In Spite of All the Danger
## 4
## It Won't Be Long
## 2
## It's All Too Much
## 2
## It's Only Love
## 2
## Jazz Piano Song
## 4
## Jessie's Dream
## 4
## Johnny B. Goode
## 4
## Julia
## 2
## Junk
## 4
## Kansas City/Hey, Hey, Hey, Hey
## 4
## Keep Your Hands Off My Baby
## 4
## Komm Gib Mir Deine Hand
## 4
## Lady Madonna
## 1
## Leave My Kitten Alone
## 4
## Lend Me Your Comb
## 4
## Let It Be
## 1
## Like Dreamers Do
## 4
## Little Child
## 4
## Lonesome Tears in My Eyes
## 4
## Long Tall Sally
## 2
## Long, Long, Long
## 4
## Looking Glass
## 4
## Love Me Do
## 1
## Love of the Loved
## 4
## Love You To
## 2
## Lovely Rita
## 2
## Lucille
## 4
## Lucy in the Sky with Diamonds
## 3
## Madman
## 4
## Maggie Mae
## 4
## Magical Mystery Tour
## 2
## Mailman, Bring Me No More Blues
## 4
## Martha My Dear
## 4
## Matchbox
## 2
## Maxwell's Silver Hammer
## 2
## Mean Mr. Mustard
## 4
## Memphis, Tennessee
## 4
## Michelle
## 3
## Misery
## 2
## Money (That's What I Want)
## 2
## Moonlight Bay
## 4
## Mother Nature's Son
## 3
## Mr. Moonlight
## 2
## My Bonnie
## 2
## No Reply
## 2
## Norwegian Wood (This Bird Has Flown)
## 3
## Not a Second Time
## 4
## Not Guilty
## 4
## Nothin' Shakin' (But the Leaves on the Trees)
## 4
## Nowhere Man
## 1
## Ob-La-Di, Ob-La-Da
## 3
## Octopus's Garden
## 2
## Oh! Darling
## 3
## Old Brown Shoe
## 2
## One After 909
## 2
## One and One Is Two
## 4
## Only a Northern Song
## 2
## Ooh! My Soul
## 4
## P.S. I Love You
## 2
## Paperback Writer
## 1
## Penny Lane
## 1
## Piggies
## 4
## Please Mr. Postman
## 2
## Please Please Me
## 1
## Polythene Pam
## 4
## Rain
## 3
## Real Love
## 4
## Revolution 1
## 2
## Revolution 9
## 2
## Revolution
## 1
## Rip It Up/Shake, Rattle, and Roll/Blue Suede Shoes
## 4
## Rock and Roll Music
## 2
## Rocky Raccoon
## 2
## Roll Over Beethoven
## 2
## Run for Your Life
## 2
## Savoy Truffle
## 2
## Searchin'
## 4
## September in the Rain
## 4
## Sexy Sadie
## 2
## Sgt. Pepper's Lonely Hearts Club Band (Reprise)
## 4
## Sgt. Pepper's Lonely Hearts Club Band
## 3
## Shakin' in the Sixties
## 4
## She Came in Through the Bathroom Window
## 4
## She Loves You
## 1
## She Said She Said
## 2
## She's a Woman
## 1
## She's Leaving Home
## 3
## Shout
## 4
## Sie Liebt Dich
## 4
## Slow Down
## 2
## So How Come (No One Loves Me)
## 4
## Soldier of Love (Lay Down Your Arms)
## 4
## Some Other Guy
## 4
## Something
## 3
## Sour Milk Sea
## 4
## Step Inside Love/Los Paranoias
## 4
## Strawberry Fields Forever
## 3
## Sun King
## 4
## Sure to Fall (In Love with You)
## 4
## Sweet Little Sixteen
## 4
## Take Good Care of My Baby
## 4
## Taking a Trip to Carolina
## 4
## Taxman
## 3
## Teddy Boy
## 4
## Tell Me What You See
## 2
## Tell Me Why
## 2
## Thank You Girl
## 2
## That Means a Lot
## 4
## That'll Be the Day
## 4
## That's All Right (Mama)
## 4
## The Ballad of John and Yoko
## 2
## The Continuing Story of Bungalow Bill
## 2
## The End
## 4
## The Fool on the Hill
## 3
## The Honeymoon Song
## 4
## The Inner Light
## 2
## The Long and Winding Road
## 1
## The Night Before
## 2
## The Saints
## 4
## The Sheik of Araby
## 4
## The Word
## 4
## There's a Place
## 2
## Things We Said Today
## 2
## Think for Yourself
## 4
## This Boy
## 2
## Three Cool Cats
## 4
## Ticket to Ride
## 1
## Till There Was You
## 2
## Tip of My Tongue
## 4
## To Know Her is to Love Her
## 4
## Tomorrow Never Knows
## 3
## Too Much Monkey Business
## 4
## Twist and Shout
## 1
## Two of Us
## 2
## Wait
## 4
## Watching Rainbows
## 4
## We Can Work It Out
## 1
## What Goes On
## 2
## What You're Doing
## 4
## What's The New Mary Jane
## 4
## When I Get Home
## 2
## When I'm Sixty-Four
## 3
## While My Guitar Gently Weeps
## 3
## Why Don't We Do It in the Road?
## 4
## Wild Honey Pie
## 4
## Winston's Walk
## 4
## With a Little Help from My Friends
## 3
## Within You Without You
## 3
## Woman
## 4
## Words of Love
## 2
## Yellow Submarine
## 1
## Yer Blues
## 2
## Yes It Is
## 2
## Yesterday
## 1
## You Can't Do That
## 2
## You Know My Name (Look Up the Number)
## 4
## You Know What to Do
## 4
## You Like Me Too Much
## 2
## You Never Give Me Your Money
## 2
## You Won't See Me
## 3
## You'll Be Mine
## 4
## You're Going to Lose That Girl
## 4
## You've Got to Hide Your Love Away
## 3
## You've Really Got a Hold on Me
## 4
## Young Blood
## 4
## Your Mother Should Know
## 2
##
## Within cluster sum of squares by cluster:
## [1] 7.051222 11.577611 7.337453 9.803310
## (between_SS / total_SS = 64.3 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
Examine and compare the cluster centers for K = 3 and K = 4:
# Per-cluster frequency and (mean, sd) of each feature for the K = 3
# clustering, computed on the original (not normalized) feature values.
clusters.K3.all.vars.stats <- summarizeClusterStats(
  feature.set = the.beatles.songs.num,
  clusters = clusters.K3.all.vars$cluster,
  cl.num = 3
)
clusters.K3.all.vars.stats
## Cluster 1 Cluster 2 Cluster 3
## Frequency 112 135 63
## Duration (mean, sd) 158.69, 33.04 145.21, 33.22 173.54, 39.9
## Other.releases (mean, sd) 14.89, 6.42 2.07, 3.34 18.63, 8.38
## Covered.by (mean, sd) 4.09, 3.56 0.7, 1.51 17.41, 3.88
## Top.50.Billboard (mean, sd) 1.75, 6.12 0.19, 1.38 16.46, 18.34
# Per-cluster frequency and (mean, sd) of each feature for the K = 4
# clustering, computed on the original (not normalized) feature values.
clusters.K4.all.vars.stats <- summarizeClusterStats(
  feature.set = the.beatles.songs.num,
  clusters = clusters.K4.all.vars$cluster,
  cl.num = 4
)
clusters.K4.all.vars.stats
## Cluster 1 Cluster 2 Cluster 3
## Frequency 31 99 46
## Duration (mean, sd) 167.16, 37.47 159.04, 32.44 174.6, 40.04
## Other.releases (mean, sd) 26.02, 4.93 14.84, 6.15 12.45, 5.42
## Covered.by (mean, sd) 15.42, 6.25 3.29, 2.95 16.22, 3.92
## Top.50.Billboard (mean, sd) 34.23, 10.28 0.84, 3.28 1.93, 6.01
## Cluster 4
## Frequency 134
## Duration (mean, sd) 144.41, 33.31
## Other.releases (mean, sd) 2.04, 3.33
## Covered.by (mean, sd) 0.75, 1.63
## Top.50.Billboard (mean, sd) 0.19, 1.38
# install.packages("fpc")
library(fpc)
?cluster.stats
<comparison criteria> <- # specify criteria (from cluster.stats()) for comparing
+ c("<criterion 1>", # different clusterings (e.g., "max.diameter", "min.separation",
+ "<criterion 2>", ...) # "average.between", "average.within", "within.cluster.ss", ...)
<distance matrix> <-
+ dist(x = <normalized dataset>)
<comparison> <- sapply(list(<clustering 1 name> = <clustering 1>$cluster, # <clustering 1> computed by kmeans()
+ <clustering 2 name> = <clustering 2>$cluster, # <clustering 2> computed by kmeans()
+ ...),
+ FUN = function(x)
+ cluster.stats(<distance matrix>, x))[<comparison criteria>, ]
Alternative 1 - show output as a table in the console:
# install.packages("knitr")
library(knitr)
kable(x = <comparison>, format = "rst")
Alternative 2 - show output as a dataframe, using an appropriate utility function:
source("Summary statistics about clusters.R")
<comparison.df> <-
+ compareMultipleClusterings(<comparison>) # show comparison as a dataframe, using a corresponding utility function
<comparison.df>
library(fpc)

# Components of the cluster.stats() result used to compare the clusterings.
comparison.criteria <- c(
  "max.diameter", "min.separation", "average.between",
  "average.within", "within.cluster.ss"
)

# Distance matrix between songs, based on all four normalized features.
# NOTE(review): the two *.var2 clusterings were computed on two variables only,
# yet they are evaluated against this 4-variable distance matrix - confirm
# that this is intended.
d <- dist(x = the.beatles.songs.num.4)

# One column of cluster.stats() results per clustering, restricted to the
# rows named in comparison.criteria.
comparison <- sapply(
  list(
    c.K3.var2 = clusters.K3$cluster,          # clustering: 3 clusters, 2 variables
    c.K4.var2 = clusters.K4$cluster,          # clustering: 4 clusters, 2 variables
    c.K3.var4 = clusters.K3.all.vars$cluster, # 3 clusters, 4 variables
    c.K4.var4 = clusters.K4.all.vars$cluster  # 4 clusters, 4 variables
  ),
  FUN = function(x) cluster.stats(d, x)
)[comparison.criteria, ]

# Alternative 1 - show output as a table in the console:
# library(knitr)
# kable(x = comparison, format = "rst")

# Alternative 2 - show output as a dataframe, using an appropriate utility function:
comparison.df <- compareMultipleClusterings(comparison)
comparison.df
## c.K3.var2 c.K4.var2 c.K3.var4 c.K4.var4
## max.diameter 1.61456014 1.61456014 1.61456014 1.34330166
## min.separation 0.03889892 0.05327394 0.07306239 0.07306239
## average.between 0.86906146 0.82500969 0.86156342 0.85727127
## average.within 0.43265401 0.40330399 0.42781533 0.38151663
## within.cluster.ss 47.00207948 41.22975881 45.92684939 35.76959542