Based on a dataset provided by the HAR project (http://web.archive.org/web/20161224072740/http:/groupware.les.inf.puc-rio.br/har), we will train a predictive model to predict the manner in which an exercise is performed, based on data collected from wearable sensors.
We’ll take the following steps, starting by loading the required libraries:
library(caret) # For machine learning workflow
library(randomForest) # For Random Forest model
library(ggplot2) # For visualization
library(corrplot) # For correlation analysis
library(dplyr) # For data manipulation
library(doParallel) # For parallel processing
library(formattable) # For formatted tables
library(caretEnsemble) # For model stacking (caretList, caretStack)
library(xgboost) # For the xgbTree models
library(gbm) # For gradient boosting machines
library(ranger) # For the fast Random Forest implementation
library(glmnet) # For regularized linear models
library(knitr) # For kable output
set.seed(123) # Ensure reproducibility
Download files if they are not yet on the system.
downloadFile <- function(fileName) {
  destfile <- paste0("pml-", fileName, ".csv")
  if (!file.exists(destfile)) {
    download.file(
      paste0("https://d396qusza40orc.cloudfront.net/predmachlearn/pml-", fileName, ".csv"),
      destfile = destfile,
      mode = "wb"
    )
  }
}
downloadFile("training")
downloadFile("testing")
We will remove columns that are either empty or unsuitable for making predictions.
cleanData <- function(fileName) {
  rawdata <- read.csv(paste0("pml-", fileName, ".csv"))
  # Remove redundant columns:
  # - timestamp columns (time-based, likely irrelevant for prediction)
  rawdata <- rawdata[, -grep("timestamp", names(rawdata))]
  # - "X" (row index) and "user_name" (not useful for prediction)
  rawdata <- rawdata[, !(names(rawdata) %in% c("X", "user_name"))]
  # - "new_window" and "num_window" (used for session tracking)
  rawdata <- rawdata[, !(names(rawdata) %in% c("new_window", "num_window"))]
  # Remove columns where most values are missing
  # Define the threshold (90% missing values)
  threshold <- 0.9 * nrow(rawdata)
  rawdata <- rawdata[, colSums(is.na(rawdata) | rawdata == "") < threshold]
  # Convert "classe" to a factor
  rawdata$classe <- as.factor(rawdata$classe)
  return(rawdata)
}
cleaned_data <- cleanData("training")
After cleaning, only 52 predictor features remain (plus the classe outcome).
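As a quick sanity check (a minimal sketch using the cleaned_data object defined above), we can confirm the dimensions and column types of the cleaned dataset:
# Expect 52 numeric predictors plus the classe factor
dim(cleaned_data) # rows x columns
sum(sapply(cleaned_data, is.numeric)) # number of numeric columns (should be 52)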
missing_values_count <- sum(is.na(cleaned_data)) # Count remaining missing values
We have 0 missing values.
nzv_vars <- nearZeroVar(cleaned_data, saveMetrics = TRUE)
count_nzv <- sum(nzv_vars$nzv == TRUE)
We have 0 near-zero-variance predictors.
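Since corrplot was loaded for correlation analysis, here is a brief sketch of how the predictor correlations could be visualized (assuming all columns except classe are numeric, as the checks above suggest):
# Correlation matrix of the predictors (excluding the classe outcome)
cor_matrix <- cor(cleaned_data[, names(cleaned_data) != "classe"])
corrplot(cor_matrix, method = "color", type = "lower", tl.cex = 0.5, tl.col = "black")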
trainIndex <- createDataPartition(cleaned_data$classe, p = 0.7, list = FALSE)
traindata <- cleaned_data[trainIndex, ]
validation_data <- cleaned_data[-trainIndex, ]
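createDataPartition performs a stratified split, which we can verify by comparing class proportions across the two sets (a quick sketch using the objects just created):
# Class proportions should be nearly identical in both sets
round(prop.table(table(traindata$classe)), 3)
round(prop.table(table(validation_data$classe)), 3)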
Some of the training steps are very time-consuming, so let’s set up parallel processing.
# Global cluster variable
parallelCluster <- NULL

# Function to stop the parallel cluster
stopParallelCluster <- function() {
  if (!is.null(parallelCluster)) {
    stopCluster(parallelCluster) # Stop the parallel cluster
    registerDoSEQ() # Reset to sequential execution
    parallelCluster <<- NULL # Clear the global cluster variable
  }
}

# Function to prepare the parallel cluster
prepareParallelCluster <- function() {
  # Detect the number of available cores and use one less
  num_cores <- max(detectCores() - 1, 1) # Ensure at least 1 core
  # Create and register the cluster globally
  parallelCluster <<- makeCluster(num_cores)
  registerDoParallel(parallelCluster)
}
# Function to load saved R objects from a .RData file if the file exists
loadObjects <- function(fileName) {
  # Append ".RData" extension to the filename
  fileName <- paste0(fileName, ".RData")
  # Check if the file exists before attempting to load
  if (file.exists(fileName)) {
    # Load objects into the global environment
    load(fileName, envir = .GlobalEnv)
  }
}
Define a train_control object with cross-validation and set up the base models. Each base learner brings complementary strengths:
- ranger captures non-linear relationships well with randomness.
- glmnet covers linear relationships effectively and handles regularization.
- xgbTree boosts weak learners by focusing on correcting previous mistakes.
- ranger and xgbTree are low-bias, high-variance models (good for capturing complexity).
- glmnet is a high-bias, low-variance model (good for simplicity and regularization).
- Combining glmnet with tree-based models improves overall performance.

MODEL_LIST_FILE <- "model_list"
loadObjects(MODEL_LIST_FILE)
if (!exists("model_list") || is.null(model_list)) {
start_time <- proc.time()
prepareParallelCluster()
# Define training control with 5-fold cross-validation
train_control <- trainControl(
method = "cv",
number = 5,
savePredictions = "final",
allowParallel = TRUE
)
# List of base models
model_list <- caretList(
classe ~ ., # Predicting Exercise Error
data = traindata,
trControl = train_control,
methodList = c("ranger", "glmnet", "xgbTree")
)
stopParallelCluster()
execution_time <- proc.time() - start_time
save(model_list, execution_time, file = paste0(MODEL_LIST_FILE,".RData"))
}
print(paste("Elapsed time:", round((execution_time["elapsed"] / 60), 2), "minutes"))
## [1] "Elapsed time: 17.83 minutes"
Combine the base models using a meta-model (xgbTree):
STACK_MODEL_FILE <- "stacked_model"
loadObjects(STACK_MODEL_FILE)
if (!exists("stacked_model") || is.null(stacked_model)) {
start_time <- proc.time()
prepareParallelCluster()
# Stack the models using a generalized linear model (GLM) as meta-learner
stacked_model <- caretStack(
model_list,
method = "xgbTree",
metric = "Accuracy",
trControl = trainControl(
method = "cv",
number = 5,
savePredictions = "final",
classProbs = TRUE, # Enable class probabilities for classification
allowParallel = FALSE,
sampling = "up" # Upsample minority classes to avoid empty folds
)
)
stopParallelCluster()
execution_time <- proc.time() - start_time
save(stacked_model, execution_time, file = paste0(STACK_MODEL_FILE,".RData"))
}
print(paste("Elapsed time:", round((execution_time["elapsed"] / 60), 2), "minutes"))
## [1] "Elapsed time: 12.95 minutes"
# Performance of the stacked model
plot(stacked_model)
# Extract the best performance metrics from the stacked model
best_results <- stacked_model$ens_model$results[
  which.max(stacked_model$ens_model$results$Accuracy),
]
# Extract accuracy and kappa values
best_accuracy <- round(best_results$Accuracy * 100, 2) # Convert to percentage
best_kappa <- round(best_results$Kappa, 4)
Excellent performance: the model achieved a high accuracy of 99.48% and a Kappa of 0.9934, indicating strong predictive ability with balanced classification.
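To understand which sensor features drive the predictions, variable importance can be inspected on one of the base learners; a brief sketch using the xgbTree member of the model list (caret's varImp supports xgbTree directly):
# Top features according to the xgbTree base model
xgb_importance <- varImp(model_list$xgbTree)
plot(xgb_importance, top = 20)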
# Make predictions on validation data
validation_predictions <- predict(stacked_model, newdata = validation_data)
# Convert probabilities to predicted classes
predicted_classes <- factor(
  colnames(validation_predictions)[max.col(validation_predictions, ties.method = "first")],
  levels = levels(validation_data$classe)
)
# Generate confusion matrix for validation data
conf_matrix <- confusionMatrix(predicted_classes, validation_data$classe)
# Extract metrics for inline display
accuracy <- conf_matrix$overall['Accuracy']
# Calculate out-of-sample error
oos_error <- 1 - accuracy
ci_lower <- conf_matrix$overall['AccuracyLower']
ci_upper <- conf_matrix$overall['AccuracyUpper']
p_value <- format.pval(conf_matrix$overall['AccuracyPValue'], digits = 4)
The out-of-sample error estimates how well the model is expected to perform on new, unseen data. A low out-of-sample error indicates that the model generalizes well and is not overfitting the training data.
Given the complexity of the sensor data and the nature of the classification task, we expect the out-of-sample error to be relatively low (ideally below 5%) for a well-tuned model.
The observed value of 0.0034 (0.34%) suggests that the model is not overfitting and is likely to perform well on new data.
In this analysis, we use cross-validation combined with a confusion matrix to estimate the out-of-sample error.
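Beyond overall accuracy, the confusion matrix also provides per-class metrics, which can reveal whether any single class is harder to predict; a brief sketch using the conf_matrix object from above:
# Per-class sensitivity and specificity from the confusion matrix
round(conf_matrix$byClass[, c("Sensitivity", "Specificity", "Balanced Accuracy")], 4)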
# Load pml-testing.csv
pml_testing <- read.csv("pml-testing.csv")
# Make predictions using the trained model
test_predictions <- predict(stacked_model, pml_testing)
vector_predicted <- colnames(test_predictions)[max.col(test_predictions, ties.method = "first")]
# Display predictions nicely using kable
kable(rbind(
  1:length(vector_predicted),
  "Predicted Classe" = vector_predicted
))
|                  | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 |
|------------------|---|---|---|---|---|---|---|---|---|----|----|----|----|----|----|----|----|----|----|----|
| Predicted Classe | B | A | B | A | A | E | D | B | A | A  | B  | C  | B  | A  | E  | E  | A  | B  | B  | B  |
All 20 predicted values were confirmed to be 100% correct.
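For reference, each prediction can also be written to its own text file, a common convention for submitting this assignment; the helper below is a hypothetical sketch (the writePredictions name is ours):
# Hypothetical helper: write each prediction to problem_1.txt ... problem_20.txt
writePredictions <- function(predictions) {
  for (i in seq_along(predictions)) {
    writeLines(predictions[i], paste0("problem_", i, ".txt"))
  }
}
writePredictions(vector_predicted)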