# Package dependencies ----
library(caret)      # createDataPartition(), confusionMatrix()
library(dplyr)      # data wrangling (mutate, %>%)
library(effects)    # loaded but not visibly used below — confirm needed
library(gtsummary)  # tbl_summary() descriptive tables
library(MASS)       # polr() proportional-odds ordinal regression
library(pROC)       # roc(), auc(), multiclass.roc()
library(readxl)     # read_excel() input
library(rmarkdown)  # report rendering
library(rms)        # lrm(), datadist(), nomogram()
library(tidyverse)  # general data manipulation / ggplot2
set.seed(1233)      # fixed seed so random cohort splits are reproducible
This analysis started with a single file containing information on 442 children. This file can be found in the ‘Input files’ folder.
# Loading the raw dataset (442 children) from the input Excel file.
# Left assignment replaces the original right-assignment (`->`) anti-idiom.
data <- read_excel('Input File/BASE_OF.xlsx')
# Formatting data: encode every categorical variable as a factor with an
# explicit level order (so model contrasts and table ordering are stable)
# and coerce the continuous ones to numeric.
data <- data %>%
  mutate(
    SEX         = factor(SEX, levels = c('1', '2')),
    Age_Group   = factor(Age_Group, levels = c('1', '2', '3')),
    Age_months  = as.numeric(Age_months),
    EDI         = factor(EDI, levels = c('1', '2', '3')),
    Obesity     = factor(Obesity, levels = c('0', '1')),
    Anemia      = factor(Anemia, levels = c('0', '1')),
    PhysActiv   = factor(PhysActiv, levels = c('0', '1')),
    KnowFeeding = factor(KnowFeeding, levels = c('1', '2', '3')),
    MotherAge   = as.numeric(MotherAge),
    EDULevel    = factor(EDULevel, levels = c('1', '2', '3', '4', '5'))
  )
# Keep an untouched copy for the shiny calculator section further below.
data_calculator <- data
First, we present an overall profile of all variables according to their EDI categories: (1) Normal, (2) Developmental Risk, (3) Developmental Issue.
# Descriptive summary table stratified by EDI category.
# NOTE(review): the knitted output below shows add_p() failing because the
# "cardx" package (>= 0.2.4) is not installed — install it so the p-value
# column is populated.
# NOTE(review): "Code" looks like a subject identifier; summarizing it as a
# continuous variable (mean/SD) is probably unintended — confirm.
# Pipes unified on the native |> (the original mixed %>% and |>).
data |>
  tbl_summary(
    statistic = list(
      all_continuous() ~ "{mean} ({sd})",
      all_categorical() ~ "{n} / {N} ({p}%)"
    ),
    digits = all_continuous() ~ 2,
    include = c("Code", "SEX", "Age_Group", "Age_months", "Obesity",
                "Anemia", "PhysActiv", "KnowFeeding", "MotherAge", "EDULevel"),
    by = EDI,        # split table by group
    missing = "no"   # don't list missing data separately
  ) |>
  add_overall() |>
  add_n() |>   # add column with total number of non-missing observations
  add_p(test = all_categorical() ~ "chisq.test") |>  # test for group difference
  bold_labels()
## The following errors were returned during `add_p()`:
## ✖ For variable `Age_Group` (`EDI`) and "p.value" statistic: The package "cardx"
## (>= 0.2.4) is required.
## ✖ For variable `Age_months` (`EDI`) and "p.value" statistic: The package
## "cardx" (>= 0.2.4) is required.
## ✖ For variable `Anemia` (`EDI`) and "p.value" statistic: The package "cardx"
## (>= 0.2.4) is required.
## ✖ For variable `Code` (`EDI`) and "p.value" statistic: The package "cardx" (>=
## 0.2.4) is required.
## ✖ For variable `EDULevel` (`EDI`) and "p.value" statistic: The package "cardx"
## (>= 0.2.4) is required.
## ✖ For variable `KnowFeeding` (`EDI`) and "p.value" statistic: The package
## "cardx" (>= 0.2.4) is required.
## ✖ For variable `MotherAge` (`EDI`) and "p.value" statistic: The package "cardx"
## (>= 0.2.4) is required.
## ✖ For variable `Obesity` (`EDI`) and "p.value" statistic: The package "cardx"
## (>= 0.2.4) is required.
## ✖ For variable `PhysActiv` (`EDI`) and "p.value" statistic: The package "cardx"
## (>= 0.2.4) is required.
## ✖ For variable `SEX` (`EDI`) and "p.value" statistic: The package "cardx" (>=
## 0.2.4) is required.
| Characteristic | N | Overall, N = 442¹ | 1, N = 256¹ | 2, N = 159¹ | 3, N = 27¹ | p-value |
|---|---|---|---|---|---|---|
| Code | 442 | 223.33 (128.95) | 191.22 (122.58) | 271.92 (118.83) | 241.63 (154.71) | |
| SEX | 442 | |||||
|     1 | 254 / 442 (57%) | 132 / 256 (52%) | 102 / 159 (64%) | 20 / 27 (74%) | ||
|     2 | 188 / 442 (43%) | 124 / 256 (48%) | 57 / 159 (36%) | 7 / 27 (26%) | ||
| Age_Group | 442 | |||||
|     1 | 184 / 442 (42%) | 134 / 256 (52%) | 47 / 159 (30%) | 3 / 27 (11%) | ||
|     2 | 133 / 442 (30%) | 73 / 256 (29%) | 49 / 159 (31%) | 11 / 27 (41%) | ||
|     3 | 125 / 442 (28%) | 49 / 256 (19%) | 63 / 159 (40%) | 13 / 27 (48%) | ||
| Age_months | 442 | 22.94 (18.43) | 18.39 (16.77) | 28.53 (19.04) | 33.07 (17.22) | |
| Obesity | 442 | |||||
|     0 | 323 / 442 (73%) | 211 / 256 (82%) | 97 / 159 (61%) | 15 / 27 (56%) | ||
|     1 | 119 / 442 (27%) | 45 / 256 (18%) | 62 / 159 (39%) | 12 / 27 (44%) | ||
| Anemia | 430 | |||||
|     0 | 347 / 430 (81%) | 194 / 250 (78%) | 131 / 155 (85%) | 22 / 25 (88%) | ||
|     1 | 83 / 430 (19%) | 56 / 250 (22%) | 24 / 155 (15%) | 3 / 25 (12%) | ||
| PhysActiv | 442 | |||||
|     0 | 22 / 442 (5.0%) | 2 / 256 (0.8%) | 12 / 159 (7.5%) | 8 / 27 (30%) | ||
|     1 | 420 / 442 (95%) | 254 / 256 (99%) | 147 / 159 (92%) | 19 / 27 (70%) | ||
| KnowFeeding | 441 | |||||
|     1 | 220 / 441 (50%) | 93 / 255 (36%) | 108 / 159 (68%) | 19 / 27 (70%) | ||
|     2 | 189 / 441 (43%) | 136 / 255 (53%) | 45 / 159 (28%) | 8 / 27 (30%) | ||
|     3 | 32 / 441 (7.3%) | 26 / 255 (10%) | 6 / 159 (3.8%) | 0 / 27 (0%) | ||
| MotherAge | 442 | 31.93 (6.69) | 31.77 (6.44) | 31.82 (6.68) | 34.19 (8.75) | |
| EDULevel | 442 | |||||
|     1 | 7 / 442 (1.6%) | 4 / 256 (1.6%) | 2 / 159 (1.3%) | 1 / 27 (3.7%) | ||
|     2 | 102 / 442 (23%) | 50 / 256 (20%) | 45 / 159 (28%) | 7 / 27 (26%) | ||
|     3 | 144 / 442 (33%) | 94 / 256 (37%) | 44 / 159 (28%) | 6 / 27 (22%) | ||
|     4 | 5 / 442 (1.1%) | 3 / 256 (1.2%) | 0 / 159 (0%) | 2 / 27 (7.4%) | ||
|     5 | 184 / 442 (42%) | 105 / 256 (41%) | 68 / 159 (43%) | 11 / 27 (41%) | ||
| ¹ Mean (SD); n / N (%) | ||||||
To analyze the contributions of selected variables to the neurodevelopmental process in children, we used ordinal logistic regression considering the conventional categories used in the EDI: (1),(2), and (3).
# Building up the model: proportional-odds ordinal logistic regression of
# EDI (ordered 1 < 2 < 3) on demographic and behavioural covariates.
# Hess = TRUE keeps the Hessian so summary() can report standard errors.
data_model <- polr(EDI ~ SEX + Age_Group + Age_months + Obesity +
                     Anemia + PhysActiv + KnowFeeding + MotherAge + EDULevel,
                   data = data, Hess = TRUE)
# Evaluating the model
# summary(data_model)
ctable <- coef(summary(data_model))
# polr() reports only t-values; derive two-sided p-values from the normal
# approximation 2 * P(Z > |t|).
pvalores <- pnorm(abs(ctable[, "t value"]), lower.tail = FALSE) * 2
ctable <- cbind(ctable, "p value" = round(pvalores, 4))
print(ctable)
## Value Std. Error t value p value
## SEX2 -0.571567930 0.22306035 -2.5623914 0.0104
## Age_Group2 0.955020393 0.41857189 2.2816162 0.0225
## Age_Group3 1.112381441 0.81288803 1.3684313 0.1712
## Age_months 0.005407554 0.01814973 0.2979413 0.7657
## Obesity1 0.905697678 0.25616677 3.5355783 0.0004
## Anemia1 -0.250003964 0.29907334 -0.8359286 0.4032
## PhysActiv1 -1.494610284 0.54397891 -2.7475519 0.0060
## KnowFeeding2 -1.117780765 0.23105792 -4.8376647 0.0000
## KnowFeeding3 -1.719715200 0.50100206 -3.4325512 0.0006
## MotherAge -0.010691935 0.01663566 -0.6427118 0.5204
## EDULevel2 1.076458792 0.93942707 1.1458673 0.2519
## EDULevel3 0.339550497 0.93473193 0.3632598 0.7164
## EDULevel4 0.922853364 1.34799375 0.6846125 0.4936
## EDULevel5 0.925955531 0.93689878 0.9883197 0.3230
## 1|2 -0.567455009 1.31176912 -0.4325876 0.6653
## 2|3 2.528139496 1.31277004 1.9258053 0.0541
According to this table, Sex, Age groups, Obesity, Physical Activity, and Knowledge of Feeding contribute to the EDI score (p<0.05). Then, the next models will include only these variables.
In this section, we used previously selected variables to test models using randomly stratified training (80%) and validation (20%) cohorts within our dataset. We run this process 1000 times and plot the summary of the Area Under the Curve (AUC), accuracy and Cohen’s kappa statistics.
# Repeated hold-out validation: 1000 random stratified 80/20 splits.
n_reps <- 1000
# Preallocate per-iteration metric vectors and collect labels, predictions
# and probabilities in lists — growing vectors with c()/rbind() inside a
# 1000-iteration loop is O(n^2) in copies.
accuracy_vect <- numeric(n_reps)
kappa_vect <- numeric(n_reps)
labels_list <- vector("list", n_reps)
pred_list <- vector("list", n_reps)
probs_list <- vector("list", n_reps)
for (i in seq_len(n_reps)) {
  # Random stratified split into training (80%) / validation (20%) cohorts
  trainIndex <- createDataPartition(data$EDI, p = 0.8, list = FALSE)
  training_data <- data[trainIndex, ]
  validation_data <- data[-trainIndex, ]
  # Training the model on the training cohort.
  # NOTE(review): the narrative above says only the significant variables
  # (Sex, Age group, Obesity, PhysActiv, KnowFeeding) would be kept, but the
  # full covariate set is refitted here — confirm which was intended.
  data_model <- polr(EDI ~ SEX + Age_Group + Age_months + Obesity +
                       Anemia + PhysActiv + KnowFeeding + MotherAge + EDULevel,
                     data = training_data, Hess = TRUE)
  # Predicted class labels and class-membership probabilities on hold-out
  predictions <- predict(data_model, newdata = validation_data)
  probes <- predict(data_model, newdata = validation_data, type = "probs")
  # Evaluating accuracy and Cohen's kappa for this split
  cm <- confusionMatrix(predictions, validation_data$EDI)
  accuracy_vect[i] <- cm$overall["Accuracy"]
  kappa_vect[i] <- cm$overall["Kappa"]
  # Saving pooled results for the ROC analysis below
  labels_list[[i]] <- as.character(validation_data$EDI)
  pred_list[[i]] <- as.character(predictions)
  probs_list[[i]] <- as.data.frame(probes)
}
# Flatten the accumulators into the same objects downstream code expects.
labels_acu <- unlist(labels_list)
pred_acu <- unlist(pred_list)
probs_acu <- do.call(rbind, probs_list)
## Warning: glm.fit: probabilidades ajustadas numericamente 0 ou 1 ocorreu
## Warning: glm.fit: probabilidades ajustadas numericamente 0 ou 1 ocorreu
## Warning: glm.fit: probabilidades ajustadas numericamente 0 ou 1 ocorreu
## Warning: glm.fit: probabilidades ajustadas numericamente 0 ou 1 ocorreu
## Warning: glm.fit: probabilidades ajustadas numericamente 0 ou 1 ocorreu
## Warning: glm.fit: probabilidades ajustadas numericamente 0 ou 1 ocorreu
## Warning: glm.fit: probabilidades ajustadas numericamente 0 ou 1 ocorreu
## Warning: glm.fit: probabilidades ajustadas numericamente 0 ou 1 ocorreu
## Warning: glm.fit: probabilidades ajustadas numericamente 0 ou 1 ocorreu
## Warning: glm.fit: probabilidades ajustadas numericamente 0 ou 1 ocorreu
## Warning: glm.fit: probabilidades ajustadas numericamente 0 ou 1 ocorreu
## Warning: glm.fit: probabilidades ajustadas numericamente 0 ou 1 ocorreu
## Warning: glm.fit: probabilidades ajustadas numericamente 0 ou 1 ocorreu
## Warning: glm.fit: probabilidades ajustadas numericamente 0 ou 1 ocorreu
## Warning: glm.fit: probabilidades ajustadas numericamente 0 ou 1 ocorreu
## Warning: glm.fit: probabilidades ajustadas numericamente 0 ou 1 ocorreu
## Warning: glm.fit: probabilidades ajustadas numericamente 0 ou 1 ocorreu
## Warning: glm.fit: probabilidades ajustadas numericamente 0 ou 1 ocorreu
## Warning: glm.fit: probabilidades ajustadas numericamente 0 ou 1 ocorreu
## Warning: glm.fit: probabilidades ajustadas numericamente 0 ou 1 ocorreu
## Warning: glm.fit: probabilidades ajustadas numericamente 0 ou 1 ocorreu
## Warning: glm.fit: probabilidades ajustadas numericamente 0 ou 1 ocorreu
## Warning: glm.fit: probabilidades ajustadas numericamente 0 ou 1 ocorreu
## Warning: glm.fit: probabilidades ajustadas numericamente 0 ou 1 ocorreu
After these 1000 simulations, we obtained a mean accuracy of 66.49% (range 54.22%–78.82%) and a mean Cohen’s kappa of 31.93% (range 4.4%–56.59%).
In addition, we can generate the following plots (histograms and AUC curves):
# Histograms from Accuracy and Kappa
# Save both 1000-replicate metric distributions side by side in one PNG.
png('Results/Accuracy and Kappa.png',width=3800,height = 2200,res=600)
par(mfrow = c(1, 2))  # two panels: accuracy (left), kappa (right)
hist(accuracy_vect, main = "Accuracy Distribution", col = "lightblue")
hist(kappa_vect, main = "Kappa Distribution", col = "lightgreen")
dev.off()
## png
## 2
# Generating one-vs-rest ROC/AUC curves from the pooled predictions
# Checking labels: restore the factor levels so classes keep their order
labels_factor <- factor(labels_acu, levels = levels(data$EDI))
# Multiclass AUC via pROC (pairwise/one-vs-rest combination)
auc_multi <- multiclass.roc(response = labels_factor, predictor = as.matrix(probs_acu))
nivels <- levels(data$EDI)
# Saving plot. Open the PNG device BEFORE calling par(): par() only affects
# the currently active device, so the stray par(mfrow = c(1, 3)) that used
# to precede png() had no effect on the saved file and was removed.
png('Results/AUC.png', width = 5000, height = 2000, res = 600)
par(mfrow = c(1, 3))
for (class in nivels) {
  # Binary one-vs-rest labels (1 when it matches the class, 0 otherwise)
  binary_response <- ifelse(labels_factor == class, 1, 0)
  if (length(unique(binary_response)) == 2) {
    # ROC curve of this class against its predicted probability column
    roc_class <- roc(binary_response, probs_acu[[class]])
    auc_value <- round(auc(roc_class), 2)  # single rounding (was 3 then 2)
    # Plotting ROC
    plot(roc_class,
         main = paste("ROC - Class", class),
         col = "blue",
         lwd = 2)
    # Adding the AUC value to the panel
    text(x = 0.4, y = 0.2, labels = paste("AUC =", auc_value),
         col = "red", cex = 1.2)
  } else {
    cat("Class", class, "has insufficient data\n")
  }
}
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
dev.off()
## png
## 2
Finally, we can visualize these plots.
Histograms for Accuracy and Kappa statistics
AUC curves
Then, using this pre-established model, we developed a nomogram including the variables previously selected: Sex, Age groups, Obesity, Physical Activity, and Knowledge of Feeding.
# Setting human-readable factor labels for the nomogram axes
data$SEX <- factor(data$SEX, levels = c(1, 2), labels = c("Male", "Female"))
data$Age_Group <- factor(data$Age_Group, levels = c(1, 2, 3),
                         labels = c("<1yo", "1-2yo", "3-4yo"))
data$Obesity <- factor(data$Obesity, levels = c(0, 1), labels = c("No", "Yes"))
data$PhysActiv <- factor(data$PhysActiv, levels = c(0, 1), labels = c("No", "Yes"))
data$KnowFeeding <- factor(data$KnowFeeding, levels = c(1, 2, 3),
                           labels = c("Low", "Intermediate", "High"))
# rms label() attributes control the variable names printed on the nomogram
label(data$SEX) <- "Sex"
label(data$Age_Group) <- "Age (groups)"
label(data$Obesity) <- "Obesity"
label(data$PhysActiv) <- "Physical Activity"
label(data$KnowFeeding) <- "Knowledge on Feeding"
# rms needs a datadist object describing each predictor's range/levels
dd <- datadist(data)
options(datadist = "dd")
# Logistic model restricted to the previously selected predictors
rms_model <- lrm(EDI ~ SEX + Age_Group + Obesity +
                   PhysActiv + KnowFeeding,
                 data = data)
# Map the linear predictor to a probability with the logistic CDF.
# NOTE(review): EDI has 3 ordered levels, so lrm() fits a proportional-odds
# model; plogis() then corresponds to one cumulative split — confirm which
# threshold the "Probability" axis should represent.
nom <- nomogram(rms_model,
                fun = plogis,  # equivalent to function(x) plogis(x)
                funlabel = "Probability")
# Plotting the nomogram to file
png('Results/nomogram.png', width = 4800, height = 3200, res = 600)
plot(nom, xfrac = .4)
dev.off()
## png
## 2
We also created a virtual calculator using shinyPredict.
library(shiny)
library(ggplot2)
library(DT)
library(survival)
library(splines)
library(shinythemes)
library(ggthemes)
library(Epi)
# Load the list of fitted models (object `tmp.mallit`) produced elsewhere
load(file = "models.RData")
# One radio-button value per model, labelled by its formula's right-hand side
lv.malli.value <- as.character(seq_along(tmp.mallit))
names(lv.malli.value) <- vapply(tmp.mallit,
                                function(x) as.character(formula(x))[3],
                                character(1))
# Model-comparison table: AIC per model, ordered best-first, plus the
# relative likelihood exp((AICmin - AIC) / 2) of each model vs the best.
tmp.dt.AIC <- data.frame(
  Model = vapply(tmp.mallit, function(x) as.character(formula(x))[3],
                 character(1)),
  AIC = vapply(tmp.mallit, AIC, numeric(1))
)
tmp.dt.AIC <- tmp.dt.AIC[order(tmp.dt.AIC$AIC), ]
tmp.dt.AIC$Relative.likelihood <- exp((tmp.dt.AIC$AIC[1] - tmp.dt.AIC$AIC) / 2)
# Time variable name — only meaningful when the first model is a coxph fit;
# parses the first argument of Surv(...) out of the model formula.
lv.i.coxph <- sum(class(tmp.mallit[[1]]) %in% 'coxph')
lv.time <- gsub(x = strsplit(x = strsplit(x = as.character(formula(tmp.mallit[[1]]))[2],
                                          split = "\\(")[[1]][2],
                             split = ",")[[1]][1],
                pattern = "\\)", replacement = "")
# Display titles: use list names when present, else "Model <i>"
tmp.otsikko <- names(tmp.mallit)
if (is.null(tmp.otsikko)) {
  Models.otsikko <- paste0("Model ", seq_along(tmp.mallit))
}
if (!is.null(tmp.otsikko)) {
  Models.otsikko <- ifelse(tmp.otsikko == "",
                           paste0("Model ", seq_along(tmp.mallit)),
                           tmp.otsikko)
}
# Osa 1 loppu ("Part 1 end" — shinyPredict section marker)
lv.xvars <- c("SEX", "Age_Group", "Obesity", "PhysActiv", "KnowFeeding")
# Osa 1A alku ("Part 1A start" — shinyPredict section marker)
# Data frame containing predictions: builds the single covariate combination
# selected in the UI and appends the chosen model's prediction with a 95%
# Wald interval (fit +/- 1.96 * se).
tee.dt <- function(input) {
  lv.dt <- expand.grid(
    # Osa 1A loppu
    SEX = factor(input$SEX, levels = c("Male", "Female")),
    Age_Group = factor(input$Age_Group, levels = c("<1yo", "1-2yo", "3-4yo")),
    Obesity = factor(input$Obesity, levels = c("No", "Yes")),
    PhysActiv = factor(input$PhysActiv, levels = c("No", "Yes")),
    KnowFeeding = factor(input$KnowFeeding, levels = c("Low", "Intermediate", "High")) # Osa 2 alku
  )
  lv.malli <- tmp.mallit[[as.numeric(input$Model)]]
  if (inherits(lv.malli, "coxph")) {
    # Cox models: predicted survival probability
    lv.pred <- predict(lv.malli, newdata = lv.dt, se.fit = TRUE, type = "survival")
  } else if (inherits(lv.malli, c("lm", "glm"))) {
    # (G)LMs: prediction on the response scale
    lv.pred <- predict(lv.malli, newdata = lv.dt, se.fit = TRUE, type = "response")
  } else {
    # Previously an unsupported class fell through with `lv.pred` undefined,
    # producing a cryptic "object not found" error; fail fast instead.
    stop("Unsupported model class: ", paste(class(lv.malli), collapse = "/"))
  }
  lv.dt$Predicted <- lv.pred$fit
  lv.dt$se.fit <- lv.pred$se.fit
  lv.dt$Predicted.lo <- lv.pred$fit - 1.96 * lv.pred$se.fit
  lv.dt$Predicted.hi <- lv.pred$fit + 1.96 * lv.pred$se.fit
  if (inherits(lv.malli, "coxph")) {
    # Survival probabilities cannot be negative; clamp the lower bound at 0
    lv.dt$Predicted.lo <- ifelse(lv.dt$Predicted.lo < 0, 0, lv.dt$Predicted.lo)
  }
  lv.dt
}
# Define UI: a sidebar of model/covariate selectors and a tabbed main panel
ui <- fluidPage(theme = shinytheme(
"readable"),
# Application title
titlePanel(
"Neurodevelopmental disorder in Children"),
# Sidebar with the model choice and one radio group per covariate
sidebarLayout(
sidebarPanel(
#textAreaInput("AddPlot",label="Add plot script",value="",rows=3),
#actionButton("Toteuta", label="Submit"),
# Model picker — choices are generated from the formulas in models.RData
radioButtons("Model",label="Select model",
choices=lv.malli.value,inline=FALSE),
# Osa 2 loppu ("Part 2 end" — shinyPredict section marker)
radioButtons(inputId="SEX", label="Sex",
choices=c("Male"="Male","Female"="Female"), selected = "Male"),
radioButtons(inputId="Age_Group", label="Age (groups)",
choices=c("<1yo"="<1yo","1-2yo"="1-2yo","3-4yo"="3-4yo"), selected = "<1yo"),
radioButtons(inputId="Obesity", label="Obesity",
choices=c("No"="No","Yes"="Yes"), selected = "No"),
radioButtons(inputId="PhysActiv", label="Physical Activity",
choices=c("No"="No","Yes"="Yes"), selected = "No"),
radioButtons(inputId="KnowFeeding", label="Knowledge on Feeding",
choices=c("Low"="Low","Intermediate"="Intermediate","High"="High"), selected = "Low"), # Osa 3 alku ("Part 3 start")
),
# Main panel: prediction plot, prediction table, model summary, AIC, info
mainPanel(
tabsetPanel(id='tabs',
tabPanel("Plot",plotOutput("distPlot")),
tabPanel("Data", dataTableOutput("Data")),
tabPanel("Summary", verbatimTextOutput("Summary")),
tabPanel("AIC", htmlOutput("AICTab")),
tabPanel("Info",htmlOutput("Info"))
)
)
))
# Define server logic: renders the prediction plot, the prediction table,
# the per-model summary, the AIC comparison table, and the info page.
server <- function(input, output, session) {
  output$distPlot <- renderPlot({
    tmp.dt <- tee.dt(input)
    # Random hue so each re-render is visually distinct
    colore <- hsv(runif(1), 1, 1)
    # Build the plot directly instead of the original eval(parse(text = ...))
    # string round-trip, which added nothing and hid syntax errors.
    lv.p1 <- ggplot(tmp.dt, aes(x = 1, y = Predicted)) +
      geom_line(color = colore) +
      geom_point(color = colore) +
      geom_errorbar(aes(ymin = Predicted.lo, ymax = Predicted.hi),
                    width = 0.1, color = colore) +
      geom_ribbon(aes(ymin = Predicted.lo, ymax = Predicted.hi),
                  alpha = .2, color = colore) +
      coord_flip() +
      theme_bw() +
      # Annotate the point estimate and its 95% interval as text
      geom_text(aes(x = 1.2, y = Predicted),
                label = paste0(round(tmp.dt$Predicted, 2),
                               ' (', round(tmp.dt$Predicted.lo, 2), '-',
                               round(tmp.dt$Predicted.hi, 2), ')'),
                size = 3.5) +
      scale_x_continuous(name = '', limits = c(0.5, 1.5), breaks = c(0, 2)) +
      scale_y_continuous(name = 'Prediction', limits = c(-0.2, 1.2),
                         breaks = c(0, 0.25, 0.5, 0.75, 1))
    # Touch the (currently commented-out) submit button so the plot would
    # re-render if it is ever re-enabled in the UI.
    input$Toteuta
    lv.p1
  }, width = 800, height = 500, res = 300)
  output$Data <- DT::renderDataTable({
    tmp.dt <- tee.dt(input)
    tmp.title <- 'Data and predictions'
    # Exportable table of the covariate combination and its prediction
    DT::datatable(tmp.dt,
                  extensions = 'Buttons', escape = TRUE,
                  options = list(
                    pageLength = 50,
                    dom = 'Blfrtip',
                    buttons = list(
                      list(extend = 'copy', title = tmp.title),
                      list(extend = 'csv', title = tmp.title),
                      list(extend = 'excel', title = tmp.title),
                      list(extend = 'pdf', title = tmp.title)
                    )
                  ))
  })
  output$AICTab <- renderPrint({
    # Caption typo fixed: "Compring" -> "Comparing"
    knitr::kable(tmp.dt.AIC, format = 'html', caption = 'Comparing models with AIC')
  })
  output$Summary <- renderPrint({
    lv.malli <- tmp.mallit[[as.numeric(input$Model)]]
    # Proportional-hazards diagnostic only applies to Cox models
    if (inherits(lv.malli, "coxph")) {
      lv.testi <- cox.zph(lv.malli)
    } else {
      lv.testi <- NULL
    }
    tmp.ulos.S <- list(summary(lv.malli), lv.testi)
    if (is.null(lv.testi)) { names(tmp.ulos.S) <- c('Model', '') }
    if (!is.null(lv.testi)) { names(tmp.ulos.S) <- c('Model', 'Testing PH assumption') }
    cat("\n Current model:", Models.otsikko[as.numeric(input$Model)], "\n\n")
    tmp.ulos.S
  })
  output$Info <- renderText({
    HTML(scan(quiet = TRUE, file = "ModelInfo.html", what = ""))
  })
}
# Run the application
shinyApp(ui = ui, server = server)
# Export a static (serverless) copy of the app with shinylive.
# NOTE(review): when this script is run as a whole, this line executes only
# after the shiny session ends — normally run it as a separate step.
shinylive::export(appdir = "ShinyApp", destdir = "site")
#httpuv::runStaticServer("docs")
To run the interactive shinyPredict calculator, click below: