library(haven)
library(lavaan)

# Read the Stata file with haven.
data <- read_dta("/Volumes/Untitled/Noise/2025_11_25_NAKO_noise_data.dta")

# The mediator is stored as an ordered factor; age as a plain numeric vector.
data$d_umw5 <- as.ordered(data$d_umw5)
data$age <- as.numeric(data$age)

# Multi-category covariates become unordered factors (they are expanded into
# dummy columns later in the script).
for (factor_var in c("marital", "education", "site")) {
  data[[factor_var]] <- as.factor(data[[factor_var]])
}

# All remaining model variables are coerced to numeric. Going through
# as.character() first means a factor-like column would yield its printed
# values rather than its internal integer codes.
for (numeric_var in c("srh", "bi_noise", "sex", "migrant",
                      "pm25_iqr", "no2_iqr", "ndvi_iqr_reversed")) {
  data[[numeric_var]] <- as.numeric(as.character(data[[numeric_var]]))
}

library(dplyr)

# Collapse sub-site codes into a single level each: 51/52/53 -> "51" (Berlin)
# and 811/812/813 -> "81". Every other level keeps its own label, because
# `.default` is given one replacement per existing factor level.
data$site <- recode_factor(
  data$site,
  `51` = "51", `52` = "51", `53` = "51",
  `811` = "81", `812` = "81", `813` = "81",
  .default = levels(data$site)
)

# Every variable that enters the path model
vars_in_sem <- c(
  "d_umw5", "srh", "bi_noise", "age", "sex", "marital", "education",
  "migrant", "site", "pm25_iqr", "no2_iqr", "ndvi_iqr_reversed"
)

# Keep only rows that are complete on all model variables
has_all_sem_vars <- complete.cases(data[, vars_in_sem])
data <- data[has_all_sem_vars, ]

# Number of rows left in the analysis sample
nrow(data)

# Expand each multi-category factor into one 0/1 indicator column per level
# (no intercept column, so no level is dropped at this stage)
edu_dummies <- model.matrix(~ 0 + education, data = data)
marital_dummies <- model.matrix(~ 0 + marital, data = data)
site_dummies <- model.matrix(~ 0 + site, data = data)

# Append the indicator columns, then drop the factors they were built from
data <- cbind(data, edu_dummies, site_dummies, marital_dummies)
data[c("education", "site", "marital")] <- NULL

# Path model - simple first pass
# lavaan model syntax for a single-mediator path model:
#   * srh is regressed on bi_noise (label c), d_umw5 (label b) and covariates
#   * d_umw5 is regressed on bi_noise (label a) and the same covariates
#   * indirect is defined as a*b, total as c + indirect
# Each categorical covariate enters as dummy columns with one dummy left out;
# the left-out level acts as the reference category (presumably marital1,
# education3 and site11 - verify against the dummy column names in `data`).
sem_model_simple <- '
  # Paths only - no ~~ (just-identified, df=0 expected)
  srh ~ c*bi_noise + b*d_umw5 + age + sex + migrant + 
      marital2 + marital3 + marital4 + marital5 + 
      education1 + education2 + education4 + education5 + education6 + education7 + education8 + 
      site12 + site21 + site22 + site23 + site31 + site32 + site33 + site41 + site42 + 
      site51 + site61 + site62 + site63 + site71 + site81 + 
      pm25_iqr + no2_iqr + ndvi_iqr_reversed

  d_umw5 ~ a*bi_noise + age + sex + migrant + 
      marital2 + marital3 + marital4 + marital5 + 
      education1 + education2 + education4 + education5 + education6 + education7 + education8 + 
      site12 + site21 + site22 + site23 + site31 + site32 + site33 + site41 + site42 + 
      site51 + site61 + site62 + site63 + site71 + site81 + 
      pm25_iqr + no2_iqr + ndvi_iqr_reversed

  indirect := a*b
  total := c + indirect
'

# First fit without bootstrapping: DWLS estimator, with both the mediator
# (d_umw5) and the outcome (srh) declared as ordered-categorical.
fit_test <- sem(sem_model_simple, data = data, estimator = "DWLS", 
                ordered = c("d_umw5", "srh"))

# Degrees of freedom on their own, then together with the main fit indices
fitMeasures(fit_test, "df") 
fitMeasures(fit_test, c("chisq", "df", "cfi", "rmsea", "srmr"))

# Full estimate table with fit measures, standardized solutions and R-squared
summary(fit_test, fit.measures = TRUE, standardized = TRUE, rsquare = TRUE)

# Convergence flag and optimizer details for the test fit
inspect(fit_test, "converged")
inspect(fit_test, "optim")


library(parallel)
# Leave two cores free for the rest of the system, but never ask for fewer
# than one worker: detectCores() can return NA, and on machines with one or
# two cores the subtraction alone would give a value <= 0.
n_cores <- max(1L, detectCores() - 2L, na.rm = TRUE)

# 1. PARALLEL small test (~10 min)
# Same model and estimator as the test fit, now with bootstrap standard
# errors from 50 draws on a snow cluster. `iseed` fixes the random-number
# streams so the bootstrap results can be reproduced.
fit_small <- sem(
  sem_model_simple,
  data = data,
  estimator = "DWLS",
  ordered = c("d_umw5", "srh"),
  se = "bootstrap",
  bootstrap = 50,
  parallel = "snow",
  ncpus = n_cores,
  iseed = 20251125L
)

summary(fit_small, standardized = TRUE, ci = TRUE)

# Convergence flag and optimizer details for the small bootstrap fit
inspect(fit_small, "converged")
inspect(fit_small, "optim")

# All fit indices from fit_small
fit_indices <- fitMeasures(
  fit_small,
  c("chisq", "df", "cfi", "tli", "rmsea", "srmr",
    "rmsea.ci.lower", "rmsea.ci.upper")
)

print(fit_indices)


# 2. Full run with 1000 bootstraps
# Same specification as the small test, with 1000 bootstrap draws. `iseed`
# fixes the random-number streams so the reported standard errors and
# confidence intervals can be reproduced exactly on a re-run.
fit_final <- sem(
  sem_model_simple,
  data = data,
  estimator = "DWLS",
  ordered = c("d_umw5", "srh"),
  se = "bootstrap",
  bootstrap = 1000,
  parallel = "snow",
  ncpus = n_cores,
  iseed = 20251125L
)

summary(fit_final, fit.measures = TRUE, standardized = TRUE, rsquare = TRUE)

# Convergence flag and optimizer details for the final fit
inspect(fit_final, "converged")
inspect(fit_final, "optim")

# All fit indices from fit_final
fit_indices <- fitMeasures(
  fit_final,
  c("chisq", "df", "cfi", "tli", "rmsea", "srmr",
    "rmsea.ci.lower", "rmsea.ci.upper")
)

print(fit_indices)

library(dplyr)

# 1. Extract parameters
# Estimates from the final bootstrapped fit, including standardized
# solutions and percentile bootstrap confidence intervals.
params <- parameterEstimates(fit_final,
                             standardized = TRUE,
                             ci = TRUE,
                             boot.ci.type = "perc")

print(names(params))

# 2. Results summary
# Locate the five rows of interest once (paths a, b, c and the two defined
# effects) instead of repeating the same subsetting for every column.
is_regression <- params$op == "~"
effect_rows <- list(
  which(is_regression & params$lhs == "d_umw5" & params$rhs == "bi_noise"),
  which(is_regression & params$lhs == "srh" & params$rhs == "d_umw5"),
  which(is_regression & params$lhs == "srh" & params$rhs == "bi_noise"),
  which(params$label == "indirect"),
  which(params$label == "total")
)

# Each effect must match exactly one row; otherwise the table rows would be
# misaligned with the Effect labels below.
if (any(lengths(effect_rows) != 1L)) {
  stop("Expected exactly one parameter row for each of a, b, c, ",
       "indirect and total.", call. = FALSE)
}
effects <- as.data.frame(params)[unlist(effect_rows), ]

# check.names = FALSE keeps the non-syntactic column names as written.
# With the default, data.frame() would rename them to X95_CI_Lower and
# X95_CI_Upper, and the mutate() step below could not find its inputs.
results_summary <- data.frame(
  Effect = c("bi_noise → d_umw5 (a)", "d_umw5 → srh (b)",
             "Direct bi_noise → srh (c)", "Indirect (a×b)", "Total effect"),
  Probit = effects$est,
  SE = effects$se,
  `95_CI_Lower` = effects$ci.lower,
  `95_CI_Upper` = effects$ci.upper,
  Z = effects$z,
  P = effects$pvalue,
  check.names = FALSE
) %>%
  mutate(
    # Formatted interval, e.g. "(0.012–0.034)"
    `95_CI` = paste0("(", round(`95_CI_Lower`, 3), "–",
                     round(`95_CI_Upper`, 3), ")")
  )


print(results_summary)

# 3. Full parameter table
# Plain data frame copy of the estimates (already carries std.all)
params_df <- as.data.frame(params)

# Keep the regression paths into either endogenous variable, plus the two
# defined effects (indirect and total)
main_params <- filter(
  params_df,
  (op == "~" & lhs %in% c("srh", "d_umw5")) |
    label %in% c("indirect", "total")
)

# Pick the columns to report and give them publication-style names
export_cols <- c("lhs", "op", "rhs", "est", "std.all", "se", "ci.lower",
                 "ci.upper", "z", "pvalue", "label")
export_names <- c("Outcome", "Operator", "Predictor", "Probit", "Std_beta",
                  "SE", "CI95_Low", "CI95_High", "Z", "P", "Effect_Label")
export_df <- setNames(main_params[, export_cols], export_names)

# 4. Export
out_dir <- '/Users/claireslesinski/Dropbox/NAKO Research/Noise paper/Final version/Resubmission to Env Res/2nd Revision/Path model results'

# One CSV per table, written in this order; params_raw.csv holds the
# filtered rows with their original lavaan column names
csv_exports <- list(
  "mediation_summary.csv" = results_summary,
  "path_model_full.csv" = export_df,
  "params_raw.csv" = main_params
)
for (csv_name in names(csv_exports)) {
  write.csv(csv_exports[[csv_name]], file.path(out_dir, csv_name),
            row.names = FALSE)
}

cat("Exported 3 files to:", out_dir, "\n")
cat("Key stats:\n")
print(fitMeasures(fit_final, c("chisq", "df", "cfi", "tli", "rmsea", "srmr")))



library(dplyr)

# Reload the filtered parameter table from params_raw.csv
params <- read.csv("/Users/claireslesinski/Dropbox/NAKO Research/Noise paper/Final version/Resubmission to Env Res/2nd Revision/Path model results/params_raw.csv", stringsAsFactors = FALSE)

# Pull out the row for each path and each defined effect
on_mediator <- params$lhs == "d_umw5"
on_outcome <- params$lhs == "srh"

path_a <- params[on_mediator & params$rhs == "bi_noise", ]
path_b <- params[on_outcome & params$rhs == "d_umw5", ]
path_c <- params[on_outcome & params$rhs == "bi_noise", ]
indirect <- params[params$label == "indirect", ]
total <- params[params$label == "total", ]

# Gather one column across the five effects, in table order
effect_rows <- list(path_a, path_b, path_c, indirect, total)
gather_col <- function(col) {
  unlist(lapply(effect_rows, `[[`, col))
}

# Standardized and raw estimates, plus the confidence interval as text
TableS6 <- data.frame(
  Effect = c("bi_noise → d_umw5 (a)", "d_umw5 → srh (b)", "bi_noise → srh (c')",
             "Indirect (a×b)", "Total effect"),
  β_std = gather_col("std.all"),
  b_raw = gather_col("est"),
  CI_95 = sprintf("[%.3f, %.3f]", gather_col("ci.lower"), gather_col("ci.upper"))
)

print("=== TABLE S6 FOR ENV RES - COPY β_std + CI_95 COLUMNS ===")
print(format(TableS6[, c("Effect", "β_std", "CI_95")], justify = "left"))

# EXPORT
out_dir <- "/Users/claireslesinski/Dropbox/NAKO Research/Noise paper/Final version/Resubmission to Env Res/2nd Revision/Path model results"
write.csv(TableS6, file.path(out_dir, "TableS6_final.csv"), row.names = FALSE)
