get_evaluation_measures <- function(model, test_data) {
  # Score the fitted classifier on held-out data and return a named numeric
  # vector with Accuracy, Precision, Recall and F1.
  preds <- predict(model,
                   test_data |> select(-course_outcome),
                   type = 'class')
  truth <- test_data$course_outcome
  # confusion matrix: rows = actual, columns = predicted
  conf_mat <- table(truth, preds)
  # low achievement in the course is considered the positive class
  TP <- conf_mat[2, 2]
  TN <- conf_mat[1, 1]
  FP <- conf_mat[1, 2]
  FN <- conf_mat[2, 1]
  prec <- TP / (TP + FP)
  rec <- TP / (TP + FN)
  c(Accuracy = sum(diag(conf_mat)) / sum(conf_mat),
    Precision = prec,
    Recall = rec,
    F1 = (2 * prec * rec) / (prec + rec))
}
# NOTE(review): this loop references create_feature_set(), build_DT_model() and
# caret's createDataPartition(), which are defined/loaded further down in this
# history file. It also filters on `current_week`, while the rest of the file
# creates a column named `course_week` -- confirm the intended column name.
models <- list()
eval_measures <- list()
# build and evaluate one decision-tree model per course week (weeks 1..5)
for(k in 1:5) {
print(paste("Starting computations for week", k))
# events observed up to (and including) week k
week_k_events <- events |> filter(current_week <= k)
ds <- create_feature_set(week_k_events)
# attach the outcome variable (joined on the columns shared with `grades`)
ds <- inner_join(ds, grades)
# fixed seed so the train/test split is reproducible for every k
set.seed(2023)
train_indices <- createDataPartition(ds$course_outcome,
p = 0.8, list = FALSE)
# drop the identifier and the raw grade before modelling
train_ds <- ds[train_indices,] |> select(-c(user, grade))
test_ds <- ds[-train_indices,] |> select(-c(user, grade))
dt <- build_DT_model(train_ds)
eval_dt <- get_evaluation_measures(dt, test_ds)
models[[k]] <- dt
eval_measures[[k]] <- eval_dt
}
# transform the eval_measures list (one named vector per week) into a df
eval_df <- bind_rows(eval_measures)
# embellish the evaluation report by:
# 1) adding the week column;
# 2) rounding the metric values to 4 digits;
# 3) rearranging the order of columns
eval_df |>
  mutate(week = 1:5) |>
  # wrap round() in an anonymous function: passing extra arguments to the
  # function through `...` of across() is deprecated since dplyr 1.1
  mutate(across(Accuracy:F1, \(x) round(x, digits = 4))) |>
  select(week, Accuracy:F1)
# variable importance of the week-2 model as a one-column data frame;
# as.data.frame() names the column after the deparsed expression
models[[2]]$variable.importance |>
  as.data.frame() |>
  rename(importance = `models[[2]]$variable.importance`)
events <- read.csv("data/events.csv")
# load the plotting/wrangling packages before their first use
# (originally library() calls appeared after the first arrange() call)
library(ggplot2)
library(dplyr)
library(tidyr)
str(events)
# parse the raw timestamp strings into POSIXct date-time values
events$ts <- as.POSIXct(events$ts)
events$ts[1:10]
class(events$ts)
events |> arrange(user, ts) |> head(20)
events |> arrange(user, ts) |> head(20)
# keep events chronologically ordered within each user
events |> arrange(user, ts) -> events
events |> head(20)
# start of the course
start_ts <- min(events$ts)
start_ts
# end of the course
end_ts <- max(events$ts)
end_ts   # fixed: was misspelled `ends_ts`, an undefined object
end_ts
?difftime
# course duration in weeks
difftime(end_ts, start_ts, units="weeks")
# "%V" = ISO-8601 week of the year; "%a"/"%b" = abbreviated weekday / month
strftime(events$ts[1:10], format = "%V")
?strftime
strftime(events$ts[1:10], format = "%a")
strftime(events$ts[1:10], format = "%b")
strftime(events$ts[1:10], format = "%V")
strftime(events$ts[1:10], format = "%V") |> as.integer()
# derive the ISO week number for each event
events |>
  mutate(week = strftime(ts, format = "%V") |> as.integer()) |>
  head()
# same pipeline with a relative course week; fixed: the original fragment was
# missing the leading `events |>` data argument and could not run on its own
events |>
  mutate(week = strftime(ts, format = "%V") |> as.integer()) |>
  mutate(course_week = week - min(week)) |> head()
events |>
  mutate(week = strftime(ts, format = "%V") |> as.integer()) |>
  mutate(course_week = week - min(week)) |> head()
# final version: course weeks numbered from 1; drop the helper ISO-week column
events |>
  mutate(week = strftime(ts, format = "%V") |> as.integer()) |>
  mutate(course_week = week - min(week) + 1) |>
  select(-week) -> events
table(events$course_week)
table(events$course_week) |> prop.table()
table(events$course_week) |> prop.table() |> round(digits = 4)
unique(events$action)
#unique(events$action)
unique(events$log)
table(events$action) |> prop.table() |> round(4)
# prefix lecture-topic actions with "Lecture_".
# fixed: `course_topics` was originally referenced here before its definition
# further below; defining it first makes this pipeline runnable. Re-running
# the same mutate later is a no-op, because already-prefixed values no longer
# match `course_topics`.
course_topics <- c("General", "Applications", "Theory",  "Ethics", "Feedback", "La_types")
events |>
  mutate(action = ifelse(action %in% course_topics,
                         paste0("Lecture_", action),
                         action)) -> events
# lecture-related actions are named after the course topics
course_topics <- c("General", "Applications", "Theory",  "Ethics", "Feedback", "La_types")
# prefix topic actions with "Lecture_"; already-prefixed values no longer
# match `course_topics`, so re-running this mutation is a no-op
events |>
mutate(action = ifelse(action %in% course_topics,
paste0("Lecture_", action),
action)) -> events
table(events$action) |> prop.table() |> round(4)
grades <- read.csv("data/grades.csv")
str(grades)
summary(grades)
summary(grades$grade)
# distribution of the final grades
ggplot(grades, aes(x=grade)) +
geom_density() +
theme_minimal()
ggplot(grades, aes(x=grade)) +
geom_density() +
scale_x_continuous(breaks = 1:10) +
theme_minimal()
# check for missing grades before thresholding on the median
sum(is.na(grades$grade))
# binarise the outcome: above-median grades -> "HIGH", the rest -> "LOW"
grades |>
mutate(course_outcome = ifelse(grade > median(grade), "HIGH", "LOW"),
course_outcome = as.factor(course_outcome)) -> grades
table(grades$course_outcome)
# calendar date of each event, used below for the day-level features
events$date <- as.Date(events$ts)
head(events, 20)
# Feature 1: per-user counts of each action type. All lecture-topic actions
# ("Lecture_*") are first collapsed into a single "Lecture" category.
# The redefinitions below preserve the incremental development history;
# the final definition (with values_fill = 0) is the one left in effect.
actions_tot_count <- function(events_data) {
  # v1: overall action counts, not split by user
  events_data |>
    mutate(action = ifelse(startsWith(action, "Lecture"), "Lecture", action)) |>
    pull(action) |>
    table()
}
# events from the first two course weeks
# (fixed: filter on `course_week`; the helper `week` column was dropped earlier)
w2_data <- events |> filter(course_week <= 2)
w2_data <- events |> filter(course_week <= 2)
actions_tot_count <- function(events_data) {
  # v2: identical to v1, reformatted
  events_data |>
    mutate(action = ifelse(startsWith(action, "Lecture"), "Lecture", action)) |>
    pull(action) |>
    table()
}
w2_data <- events |> filter(course_week <= 2)
actions_tot_count(w2_data)
actions_tot_count <- function(events_data) {
  # v3: counts per (user, action) pair, in long format
  events_data |>
    mutate(action = ifelse(startsWith(action, "Lecture"), "Lecture", action)) |>
    group_by(user, action) |>
    count()
}
w2_data <- events |> filter(course_week <= 2)
actions_tot_count(w2_data)
actions_tot_count <- function(events_data) {
  # v4: one row per user, one column per action
  events_data |>
    mutate(action = ifelse(startsWith(action, "Lecture"), "Lecture", action)) |>
    group_by(user, action) |>
    count() |>
    pivot_wider(id_cols = user,
                names_from = action,
                values_from = n)
}
w2_data <- events |> filter(course_week <= 2)
actions_tot_count(w2_data)
actions_tot_count <- function(events_data) {
  # v5: add a "Cnt_" prefix to the action columns
  # (fixed: the original fragment was missing the comma after `names_prefix`
  # and was left unclosed, which made the file unparseable)
  events_data |>
    mutate(action = ifelse(startsWith(action, "Lecture"), "Lecture", action)) |>
    group_by(user, action) |>
    count() |>
    pivot_wider(id_cols = user,
                names_from = action,
                names_prefix = "Cnt_",
                values_from = n)
}
actions_tot_count <- function(events_data) {
  # v6 (final): users with no events of a given action get 0 instead of NA
  events_data |>
    mutate(action = ifelse(startsWith(action, "Lecture"), "Lecture", action)) |>
    group_by(user, action) |>
    count() |>
    pivot_wider(id_cols = user,
                names_from = action,
                names_prefix = "Cnt_",
                values_from = n,
                values_fill = 0)
}
w2_data <- events |> filter(course_week <= 2)
actions_tot_count(w2_data)
# Feature 2: typical number of actions per active day, per user.
# NOTE(review): despite the "avg" name, the final version uses the median.
avg_actions_per_day = function(events_data) {
events_data |>
group_by(user, date) |>
count()
}
avg_actions_per_day(w2_data)
# count(user, date) is shorthand for the group_by() + count() above
avg_actions_per_day = function(events_data) {
events_data |>
count(user, date)
}
avg_actions_per_day(w2_data)
# final version: median of the daily action counts for each user
avg_actions_per_day = function(events_data) {
events_data |>
count(user, date) |>
group_by(user) |>
summarise(avg_action_cnt = median(n))
}
avg_actions_per_day(w2_data)
# Feature 3: entropy of each user's daily activity -- how (un)evenly the
# user's actions are spread across days. Built up incrementally below;
# the last definition is the one left in effect.
entropy_of_action_counts = function(events_data) {
events_data |>
group_by(user) |>
mutate(tot_action_cnt = n())
}
entropy_of_action_counts(w2_data)
entropy_of_action_counts = function(events_data) {
events_data |>
group_by(user) |>
mutate(tot_action_cnt = n())|>
group_by(user, date) |>
# daily_action_prop: share of the user's total actions falling on this day
summarise(daily_action_cnt = n(),
daily_action_prop = daily_action_cnt/tot_action_cnt)
}
entropy_of_action_counts(w2_data)
entropy_of_action_counts = function(events_data) {
events_data |>
group_by(user) |>
mutate(tot_action_cnt = n())|>
group_by(user, date) |>
summarise(daily_action_cnt = n(),
daily_action_prop = daily_action_cnt/tot_action_cnt) |>
# tot_action_cnt is a full-length column inside each (user, date) group, so
# summarise() yields repeated identical rows; distinct() collapses them
distinct()
}
entropy_of_action_counts(w2_data)
entropy_of_action_counts = function(events_data) {
events_data |>
group_by(user) |>
mutate(tot_action_cnt = n())|>
group_by(user, date) |>
summarise(daily_action_cnt = n(),
daily_action_prop = daily_action_cnt/tot_action_cnt) |>
distinct() |>
group_by(user) |>
# Shannon entropy (natural log) of the per-day activity proportions
summarise(action_cnt_entropy = -sum(daily_action_prop * log(daily_action_prop)))
}
entropy_of_action_counts(w2_data)
# Feature 4: number of distinct calendar days on which each user was active.
active_days_count = function(events_data) {
  events_data |>
    group_by(user) |>
    summarise(aday_cnt = n_distinct(date))
}
active_days_count(w2_data)
# Feature 5, first step: one row per (user, active day)
avg_dist_active_days = function(events_data) {
  events_data |>
    distinct(user, date)
}
# fixed: `2w_data` is not a syntactically valid R name (was a parse error)
avg_dist_active_days(w2_data)
avg_dist_active_days(w2_data)
# Feature 5: typical distance (in days) between consecutive active days.
# Multiple exploratory versions follow; the last one is left in effect.
avg_dist_active_days = function(events_data) {
events_data |>
distinct(user, date) |>
arrange(user, date) |>
# lag() without grouping: a user's first row picks up the previous user's day
mutate(prev_aday = lag(date))
}
avg_dist_active_days(w2_data)
avg_dist_active_days = function(events_data) {
events_data |>
distinct(user, date) |>
group_by(user) |>
arrange(date) |>
mutate(prev_aday = lag(date))
}
avg_dist_active_days(w2_data)
avg_dist_active_days = function(events_data) {
events_data |>
distinct(user, date) |>
group_by(user) |>
arrange(date)
}
avg_dist_active_days = function(events_data) {
events_data |>
distinct(user, date) |>
group_by(user) |>
arrange(date)
}
avg_dist_active_days(w2_data)
# arrange by user first, then lag within each user's group
avg_dist_active_days = function(events_data) {
events_data |>
distinct(user, date) |>
arrange(user, date) |>
group_by(user) |>
mutate(prev_aday = lag(date))
}
avg_dist_active_days(w2_data)
avg_dist_active_days = function(events_data) {
events_data |>
distinct(user, date) |>
arrange(user, date) |>
group_by(user) |>
mutate(prev_aday = lag(date)) |>
# gap in days to the previous active day; NA for the first active day
mutate(aday_dist = ifelse(is.na(prev_aday),
NA,
difftime(date, prev_aday, units = "days")))
}
avg_dist_active_days(w2_data)
avg_dist_active_days = function(events_data) {
events_data |>
distinct(user, date) |>
arrange(user, date) |>
group_by(user) |>
mutate(prev_aday = lag(date)) |>
mutate(aday_dist = ifelse(is.na(prev_aday),
NA,
difftime(date, prev_aday, units = "days"))) |>
# median gap per user; users with a single active day yield NA here
summarise(avg_aday_dist = median(aday_dist, na.rm = TRUE))
}
avg_dist_active_days(w2_data)
avg_dist_active_days(w2_data) |> summary()
# Feature 5 (final): median gap in days between each user's consecutive
# active days. Users with a single active day (no gaps, so the median is NA)
# are imputed with twice the maximum observed median gap.
avg_dist_active_days <- function(events_data) {
  events_data |>
    distinct(user, date) |>
    arrange(user, date) |>
    group_by(user) |>
    mutate(prev_aday = lag(date)) |>
    mutate(aday_dist = ifelse(is.na(prev_aday),
                              NA,
                              difftime(date, prev_aday, units = "days"))) |>
    summarise(avg_aday_dist = median(aday_dist, na.rm = TRUE)) -> df
  # TRUE spelled out (was `T`, which is reassignable and therefore unsafe)
  max_dist <- max(df$avg_aday_dist, na.rm = TRUE)
  df |>
    mutate(avg_aday_dist = ifelse(is.na(avg_aday_dist),
                                  2 * max_dist,
                                  avg_aday_dist))
}
avg_dist_active_days(w2_data) |> summary()
# Join the five engineered per-user features into a single feature table.
# BUG FIX: the original body ignored its `events_data` argument and always
# computed the features from the global `w2_data`, so every call returned
# week-2 features regardless of input. All features now use `events_data`
# (matching the corrected definition that appears later in this file).
create_feature_set = function(events_data) {
  f1 = actions_tot_count(events_data)
  f2 = avg_actions_per_day(events_data)
  f3 = entropy_of_action_counts(events_data)
  f4 = active_days_count(events_data)
  f5 = avg_dist_active_days(events_data)
  f1 |>
    inner_join(f2, by="user") |>
    inner_join(f3) |>
    inner_join(f4) |>
    inner_join(f5)
}
w2_features <- create_feature_set(w2_data)
str(w2_features)
# second version: coerce the result to a plain data.frame
create_feature_set = function(events_data) {
  f1 = actions_tot_count(events_data)
  f2 = avg_actions_per_day(events_data)
  f3 = entropy_of_action_counts(events_data)
  f4 = active_days_count(events_data)
  f5 = avg_dist_active_days(events_data)
  f1 |>
    inner_join(f2, by="user") |>
    inner_join(f3) |>
    inner_join(f4) |>
    inner_join(f5) |>
    as.data.frame()
}
w2_features <- create_feature_set(w2_data)
str(w2_features)
# attach the outcome to the week-2 features (joined on shared columns)
w2_features |>
inner_join(grades) -> w2_data
str(w2_data)
# keep the binary outcome only; drop the raw numeric grade
w2_features |>
inner_join(grades) |>
select(-grade) -> w2_data
str(w2_data)
ggplot(w2_data, aes(x=Cnt_Group_Work, fill=course_outcome)) +
geom_density(alpha=0.5) +
theme_minimal()
# NOTE(review): column spelled `Cnt_Group_work` here vs `Cnt_Group_Work`
# above -- presumably one of the two is a typo attempt; verify which name
# actually exists in w2_data
ggplot(w2_data, aes(x=Cnt_Group_work, fill=course_outcome)) +
geom_density(alpha=0.5) +
theme_minimal()
# NOTE(review): apply() over a data frame passes column VALUES to the
# function, not column names, so `.data[[fd]]` cannot resolve; the
# lapply(colnames(...)) version further below is the working approach
apply(w2_data |> select(-user),
2,
function(fd) {
ggplot(w2_data,
aes(x=.data[[fd]], fill=course_outcome)) +
geom_density(alpha=0.5) +
theme_minimal()
})
apply(w2_data |> select(-c(user, course_outcome)),
2,
function(fd) {
ggplot(w2_data,
aes(x=.data[[fd]], fill=course_outcome)) +
geom_density(alpha=0.5) +
theme_minimal()
})
# iterate over column NAMES and look each one up via .data[[fd]]
lapply(colnames(w2_data |> select(-c(user, course_outcome))),
function(fd) {
ggplot(w2_data,
aes(x=.data[[fd]], fill=course_outcome)) +
geom_density(alpha=0.5) +
theme_minimal()
})
# final version: one density plot per feature, split by course outcome
feature_names <- colnames(w2_features |> select(-user))
lapply(feature_names,
function(fn) {
ggplot(w2_data,
aes(x=.data[[fn]], fill=course_outcome)) +
geom_density(alpha=0.5) +
theme_minimal()
})
library(caret)
library(rpart)
library(rpart.plot)
# Train a decision tree (rpart) via caret, tuning the complexity parameter cp
# with 10-fold cross-validation and selecting the model by ROC AUC.
# Returns the underlying fitted rpart object (train()$finalModel), so callers
# can use predict(..., type = 'class') and rpart.plot() on it directly.
build_DT_model <- function(dataset) {
# candidate cp values: 0.001, 0.006, ..., 0.096
cp_grid <- expand.grid(.cp = seq(0.001, 0.1, 0.005))
# classProbs + twoClassSummary are required for the "ROC" metric below.
# NOTE(review): caret's documented resampling method string is lowercase
# "cv" -- confirm that "CV" is accepted by the installed caret version
ctrl <- trainControl(method = "CV",
number = 10,
classProbs = TRUE,
summaryFunction = twoClassSummary)
dt <- train(x = dataset |> select(-course_outcome),
y = dataset$course_outcome,
method = "rpart",
metric = "ROC",
tuneGrid = cp_grid,
trControl = ctrl)
dt$finalModel
}
# Compute Accuracy, Precision, Recall and F1 for a fitted classifier on a
# held-out test set. Returns a named numeric vector.
get_evaluation_measures <- function(model, test_data) {
predicted_vals <- predict(model,
test_data |> select(-course_outcome),
type = 'class')
actual_vals <- test_data$course_outcome
# confusion matrix: rows = actual classes, columns = predicted classes
cm <- table(actual_vals, predicted_vals)
# low achievement in the course is considered the positive class
# (levels sort alphabetically as HIGH, LOW, so LOW is row/column 2)
TP <- cm[2,2]
TN <- cm[1,1]
FP <- cm[1,2]
FN <- cm[2,1]
accuracy = sum(diag(cm)) / sum(cm)
precision <- TP / (TP + FP)
recall <- TP / (TP + FN)
F1 <- (2 * precision * recall) / (precision + recall)
c(Accuracy = accuracy,
Precision = precision,
Recall = recall,
F1 = F1)
}
# Train and evaluate one decision-tree model per course week (1..5),
# collecting the fitted models and their test-set evaluation measures.
models <- list()
eval_measures <- list()
for(k in 1:5) {
print(paste("Starting computations for week", k))
# create the dataset (features + outcome variable) for the given number of weeks (k)
wK_data <- events |> filter(course_week <= k)
wK_features <- create_feature_set(wK_data)
wK_features |>
inner_join(grades) |> select(-grade) -> wK_ds
# split the data into train and test sets
# (fixed seed so the split is reproducible for every k)
set.seed(2023)
train_indices <- createDataPartition(wK_ds$course_outcome, p=0.8, list = FALSE)
train_ds <- wK_ds[train_indices, ]|> select(-user)
test_ds <- wK_ds[-train_indices, ] |> select(-user)
# build the model (through CV) and compute eval.measures
wK_dt <- build_DT_model(train_ds)
wK_eval <- get_evaluation_measures(wK_dt, test_ds)
# add the model and its evaluation measures to the corresponding lists
models[[k]] <- wK_dt
eval_measures[[k]] <- wK_eval
}
# Assemble the per-user feature table by joining the five engineered
# features (action counts, daily activity, entropy, active days, day gaps)
# and return it as a plain data.frame.
create_feature_set = function(events_data) {
  action_counts    = actions_tot_count(events_data)
  daily_activity   = avg_actions_per_day(events_data)
  activity_entropy = entropy_of_action_counts(events_data)
  active_days      = active_days_count(events_data)
  day_gaps         = avg_dist_active_days(events_data)
  action_counts |>
    inner_join(daily_activity, by="user") |>
    inner_join(activity_entropy) |>
    inner_join(active_days) |>
    inner_join(day_gaps) |>
    as.data.frame()
}
# Re-run the per-week training loop with the corrected create_feature_set().
models <- list()
eval_measures <- list()
for(k in 1:5) {
  print(paste("Starting computations for week", k))
  # create the dataset (features + outcome variable) for the given number of weeks (k)
  wK_data <- events |> filter(course_week <= k)
  wK_features <- create_feature_set(wK_data)
  wK_features |>
    inner_join(grades) |> select(-grade) -> wK_ds
  # split the data into train and test sets (fixed seed for reproducibility)
  set.seed(2023)
  train_indices <- createDataPartition(wK_ds$course_outcome, p=0.8, list = FALSE)
  train_ds <- wK_ds[train_indices, ] |> select(-user)
  test_ds <- wK_ds[-train_indices, ] |> select(-user)
  # build the model (through CV) and compute eval.measures
  wK_dt <- build_DT_model(train_ds)
  wK_eval <- get_evaluation_measures(wK_dt, test_ds)
  # add the model and its evaluation measures to the corresponding lists
  models[[k]] <- wK_dt
  eval_measures[[k]] <- wK_eval
}
# transform the eval_measures list into a df
eval_measures[[1]]
# transform the eval_measures list into a df
eval_measures_df <- bind_rows(eval_measures)
eval_measures_df
# embellish the evaluation report by:
# 1) adding the week column;
# 2) rounding the metric values to 4 digits;
# 3) rearranging the order of columns
# (fixed: passing `.fns = round, digits = 4` through across()'s `...` is
# deprecated since dplyr 1.1 -- use an explicit anonymous function instead)
eval_measures_df |>
  mutate(across(everything(), \(x) round(x, digits = 4)))
eval_measures_df |>
  mutate(across(everything(), \(x) round(x, digits = 4))) |>
  mutate(Week = 1:5) |>
  select(Week, Accuracy:F1)
# Inspect the week-2 model.
# fixed: the original called rpart.plot(chosen_dt) BEFORE chosen_dt was
# assigned; the assignment now comes first.
chosen_dt <- models[[2]]
rpart.plot(chosen_dt)
rpart.plot(chosen_dt)
chosen_dt$variable.importance
chosen_dt$variable.importance |> as.data.frame()
# variable importance as a one-column data frame; as.data.frame() names the
# column after the deparsed expression, hence the backticked rename
chosen_dt$variable.importance |>
  as.data.frame() |>
  rename(VarImp = `chosen_dt$variable.importance`)
