Physiology data preparation

Author

Ava Hoffman

Published

February 11, 2026

Data for the physiology activity comes from: https://www.physionet.org/content/wearable-exam-stress/1.0.0/

library(tidyverse)
compile_params <- function(str_, stu_) {
  eda <- read_csv(paste0("../GEMs_local/phys_data/", stu_, "/", str_, "/EDA.csv"),
                  col_names = F)
  hr <- read_csv(paste0("../GEMs_local/phys_data/", stu_, "/", str_, "/HR.csv"),
                 col_names = F)
  temp <- read_csv(paste0("../GEMs_local/phys_data/", stu_, "/", str_, "/TEMP.csv"),
                   col_names = F)
  
  eda_time <- as.POSIXct(pull(eda[1, 1]), tz = "America/Chicago")
  hr_time <- as.POSIXct(pull(hr[1, 1]), tz = "America/Chicago")
  temp_time <- as.POSIXct(pull(temp[1, 1]), tz = "America/Chicago")
  
  eda <- eda[-c(1:2), ]
  hr <- hr[-c(1:2), ]
  temp <- temp[-c(1:2), ]
  
  # measure taken at 4Hz, or 4x per second
  eda_adj <- eda %>% mutate(date_time = eda_time + milliseconds(row_number() -
                                                                  1) * 250) %>% mutate(timestamp = floor_date(date_time, unit = "second"))
  eda_adj <- eda_adj %>% group_by(timestamp) %>% summarize(eda = mean(X1))
  
  # measure taken once per second
  hr_adj <- hr %>% mutate(timestamp = hr_time + seconds(row_number() - 1), hr = X1) %>% select(-X1)
  
  # measure taken at 4Hz, or 4x per second
  temp_adj <- temp %>% mutate(date_time = temp_time + milliseconds(row_number() -
                                                                     1) * 250) %>% mutate(timestamp = floor_date(date_time, unit = "second"))
  temp_adj <- temp_adj %>% group_by(timestamp) %>% summarize(temp = mean(X1))
  
  # Merge everything
  dat <- full_join(eda_adj, hr_adj) %>% full_join(temp_adj) %>% mutate(test = str_) %>% mutate(studentID = stu_)
  
  return(dat)
}

dat <- data.frame()
for (student in c("S1", "S2", "S3", "S4", "S5", "S6", "S7", "S8", "S9", "S10")) {
  for (exam in c("Midterm 1", "Midterm 2", "Final")) {
    sub_dat <- compile_params(str_ = exam, stu_ = student)
    dat <- bind_rows(dat, sub_dat)
  }
}

dat <- na.omit(dat)
write_csv(dat, "data/phys_data_messy.csv")
corrplot::corrplot(cor(dat[, 2:4], use = "complete.obs"))

ggplot(data = dat,
       aes(x = timestamp, y = temp, color = studentID)) +
  geom_line() + scale_x_datetime(
    date_minor_breaks = "60 min",
    date_breaks = "60 min",
    date_labels = "%H:%M",
    timezone = "America/Chicago"
  ) + facet_wrap(studentID ~ test, scales = "free_x")
# Mostly filter to remove any measurements outside of testing window.

dat_m1 <- dat %>% filter(
    test == "Midterm 1" & 
    timestamp >= as.POSIXct("2018-10-13 09:00:00", tz = "America/Chicago") & 
    timestamp <= as.POSIXct("2018-10-13 10:30:00", tz = "America/Chicago") &
    eda > 0.01  &
      temp >= 25
)

dat_m2 <- dat %>% filter(
    test == "Midterm 2" & 
    timestamp >= as.POSIXct("2018-11-10 09:00:00", tz = "America/Chicago") & 
    timestamp <= as.POSIXct("2018-11-10 10:30:00", tz = "America/Chicago") &
    eda > 0.01  &
      temp >= 25
)

dat_f <- dat %>% filter(
  test == "Final" &
    timestamp >= as.POSIXct("2018-12-05 11:00:00", tz = "America/Chicago") &
    timestamp <= as.POSIXct("2018-12-05 14:00:00", tz = "America/Chicago") &
    eda > 0.01 &
    temp >= 25
) %>% 
  # Remove some drop offs at the end
  mutate(temp =
           case_when(
             test == "Final" &
               timestamp > as.POSIXct("2018-12-05 13:50:00", tz = "America/Chicago") ~ NA,
             TRUE ~ temp
           ))

dat_clean <- bind_rows(dat_m1, dat_m2, dat_f) %>%
  group_by(test, studentID) %>%
  # Smooth curves by student and test
  mutate(
    eda = stats::filter(eda, sides = 2, filter = rep(1 / 120, 120)),
    hr = stats::filter(hr, sides = 2, filter = rep(1 / 120, 120)),
    temp = stats::filter(temp, sides = 2, filter = rep(1 / 120, 120))
  )

dat_clean <- na.omit(dat_clean)

write_csv(dat_clean, "data/phys_data.csv")