library(cardx)
library(circlize)
library(ComplexHeatmap)
library(corrplot)
library(dendsort)
library(DescTools)
library(dplyr)
library(EnvStats)
library(factoextra)
library(ggplot2)
library(ggpubr)
library(purrr)
library(gtsummary)
library(readxl)
library(scales)
library(survminer)
library(survivalAnalysis)
library(tibble)
library(tidyverse)
library(patchwork)
library(slickR)
library(survival)
library(broom)
set.seed(1234)
options(repos = BiocManager::repositories())
This analysis started with two files. The first one including expression data obtained using the Fluidigm System and the second one including clinical information from these patients. All files can be found in the ‘Input files’ folder.
#Loading expression data
read.csv('Input files/resultados Placa TEG 29_09_23_dXPRESS.csv')->expr_data
expr_data<-expr_data%>%dplyr::filter(!(Name %in% c("PC 8","PC 9","PC 14",
"PC 22","PC 30","PC 45",
"PC 63","775", "3014"))) #Excluding cases with neoadjuvancy and CECs
#Formatting all expression values as numeric data
as.numeric(expr_data$Value)->expr_data$Value
#Loading clinical data
read_excel("Input files/Database EGJA.xlsx",sheet="R")->clinic_data
#Excluding sample without gene expression data.
clinic_data%>%
filter(Name!=3641)->clinic_data
#Filtering data with available gene expression
#sort(table(c(unique(expr_data$Name),
# unique(clinic_data$Name))))
We included 86 patients in this study. Below, we represent the summary statistics.
clinic_data%>%
mutate(Sex=ifelse(Sex==1,"M","F"),
cTNM=ifelse(cTNM<3,"1+2","3+4"))%>%
tbl_summary(
statistic = list(
all_continuous() ~ "{mean} ({min}-{max})",
all_categorical() ~ "{n} / {N} ({p}%)"
),
digits = all_continuous() ~ 2,
include = c("Age","Sex","cTNM", "OS"),
by="Localization",
missing = "no",
label = list(OS ~ "Overall Survival (months)")
) |>
add_overall() |>
add_n() |> # add column with total number of non-missing observations
add_p() |> # test for a difference between groups
modify_header(label = "**Variable**") |> # update the column header
bold_labels()
| Variable | N | Overall N = 861 |
Antrum N = 201 |
Body N = 211 |
Siewert I N = 151 |
Siewert II N = 171 |
Siewert III N = 131 |
p-value2 |
|---|---|---|---|---|---|---|---|---|
| Age | 86 | 65.10 (28.82-89.80) | 64.11 (28.82-89.80) | 64.04 (43.84-84.15) | 63.76 (46.00-83.00) | 68.25 (37.59-88.00) | 65.79 (53.00-75.70) | 0.6 |
| Sex | 86 | 0.10 | ||||||
| F | 35 / 86 (41%) | 11 / 20 (55%) | 11 / 21 (52%) | 2 / 15 (13%) | 6 / 17 (35%) | 5 / 13 (38%) | ||
| M | 51 / 86 (59%) | 9 / 20 (45%) | 10 / 21 (48%) | 13 / 15 (87%) | 11 / 17 (65%) | 8 / 13 (62%) | ||
| cTNM | 86 | <0.001 | ||||||
| 1+2 | 28 / 86 (33%) | 16 / 20 (80%) | 11 / 21 (52%) | 0 / 15 (0%) | 0 / 17 (0%) | 1 / 13 (7.7%) | ||
| 3+4 | 58 / 86 (67%) | 4 / 20 (20%) | 10 / 21 (48%) | 15 / 15 (100%) | 17 / 17 (100%) | 12 / 13 (92%) | ||
| Overall Survival (months) | 86 | 28.24 (0.23-60.00) | 37.90 (0.23-60.00) | 32.19 (2.03-60.00) | 16.26 (1.00-60.00) | 23.47 (0.67-60.00) | 27.06 (0.87-60.00) | 0.2 |
| 1 Mean (Min-Max); n / N (%) | ||||||||
| 2 Kruskal-Wallis rank sum test; Pearson’s Chi-squared test; Fisher’s exact test | ||||||||
Now, we grouped patients in two categories and replicate the summary table
clinic_data%>%
mutate(Sex=ifelse(Sex==1,"M","F"),
cTNM=ifelse(cTNM<3,"1+2","3+4"),
group=ifelse(Localization %in% c('Body','Antrum'),'Body+Antrum','Siewert I-III'))%>%
tbl_summary(
statistic = list(
all_continuous() ~ "{mean} ({min}-{max})",
all_categorical() ~ "{n} / {N} ({p}%)"
),
digits = all_continuous() ~ 2,
include = c("Age","Sex","cTNM", "OS"),
by="group",
missing = "no",
label = list(OS ~ "Overall Survival (months)")
) |>
add_overall() |>
add_n() |> # add column with total number of non-missing observations
add_p() |> # test for a difference between groups
modify_header(label = "**Variable**") |> # update the column header
bold_labels()
| Variable | N | Overall N = 861 |
Body+Antrum N = 411 |
Siewert I-III N = 451 |
p-value2 |
|---|---|---|---|---|---|
| Age | 86 | 65.10 (28.82-89.80) | 64.07 (28.82-89.80) | 66.04 (37.59-88.00) | 0.4 |
| Sex | 86 | 0.020 | |||
| F | 35 / 86 (41%) | 22 / 41 (54%) | 13 / 45 (29%) | ||
| M | 51 / 86 (59%) | 19 / 41 (46%) | 32 / 45 (71%) | ||
| cTNM | 86 | <0.001 | |||
| 1+2 | 28 / 86 (33%) | 27 / 41 (66%) | 1 / 45 (2.2%) | ||
| 3+4 | 58 / 86 (67%) | 14 / 41 (34%) | 44 / 45 (98%) | ||
| Overall Survival (months) | 86 | 28.24 (0.23-60.00) | 34.98 (0.23-60.00) | 22.11 (0.67-60.00) | 0.020 |
| 1 Mean (Min-Max); n / N (%) | |||||
| 2 Wilcoxon rank sum test; Pearson’s Chi-squared test | |||||
Initially, we evaluated survival curves for all features (SWI, SWII, SWIII, Antrum, and Body).
# Creating object with survival info
clinic_data%>%
analyse_survival(vars(OS*30, Death), by='Localization') ->result_srv
# Calling p-value
pluck_survival_analysis(result_srv,"p")->p_val
# Plotting the graph
kaplan_meier_plot(result_srv,
break.time.by="breakByHalfYear",
xlab=".OS.months",
palette = c("Antrum"="gold",
"Body"="red",
"Siewert I"="blue",
"Siewert II"="green",
"Siewert III"="purple",
" "="grey"),
legend.title="All groups",
legend=c(.9,.75),
pval.coord=c(-0.1,.15),
pval.size=2.5,
tables.height = 0.22,
hazard.ratio=TRUE,
risk.table=TRUE,
table.layout="clean",
ggtheme=ggplot2::theme_bw(10))->km_plot_1
png(paste0("Results/Results_KM_TODOS _GRUPOS.png"),width=3800,height = 3000,res=600)
km_plot_1$plot / km_plot_1$table +
patchwork::plot_layout(heights = c(4, 1))
dev.off()
## png
## 2
After, we grouped all gastric regions
(Body+Antrum) and repeated the analysis.
# Creating object with survival info
clinic_data%>%
mutate(Localization=ifelse(Localization %in% c('Body','Antrum'),
'Body+Antrum',
Localization))%>%
analyse_survival(vars(OS*30, Death), by='Localization') ->result_srv
# Calling p-value
pluck_survival_analysis(result_srv,"p")->p_val
# Plotting the graph
kaplan_meier_plot(result_srv,
break.time.by="breakByHalfYear",
xlab=".OS.months",
palette = c("Body+Antrum"="#f76002",
"Siewert I"="blue",
"Siewert II"="green",
"Siewert III"="purple",
" "="grey"),
legend.title="All groups",
legend=c(.9,.75),
pval.coord=c(-0.1,.15),
pval.size=2.5,
tables.height = 0.22,
hazard.ratio=TRUE,
risk.table=TRUE,
table.layout="clean",
ggtheme=ggplot2::theme_bw(10))->km_plot_2
png(paste0("Results/Results_KM_four_groups.png"),width=3800,height = 3000,res=600)
km_plot_2$plot / km_plot_2$table +
patchwork::plot_layout(heights = c(4, 1))
dev.off()
## png
## 2
Subsequently, we evaluated the HR for all variables.
# Estimating NAs per variable
na_prop <- colMeans(is.na(clinic_data))
# Selecting variables with 20% (or less) missing
vars_ok <- names(na_prop[na_prop <= 0.2])
# Including outcome and time
vars_ok <- unique(c(vars_ok, "OS", "Death"))
data_filtered <- clinic_data %>%
select(all_of(vars_ok))
# Defining outcome and time variables
surv_obj <- Surv(time = data_filtered$OS, event = data_filtered$Death)
# Excluding other variables
exclude_vars <- c("No", "Name", "Birthdate", "SurgeryDate","DFSDate","Surgery",
"LastFolloupDate", "Recidive_Death_Date","LastStatus")
candidate_vars <- setdiff(names(data_filtered), c("OS", "Death", exclude_vars))
# Univariate Cox
gsub(" ","_",colnames(data_filtered))->colnames(data_filtered)
gsub(" ","_",candidate_vars)->candidate_vars
data_model <- data_filtered %>%
select(all_of(candidate_vars),OS,Death)%>%
mutate(
Sex = as.factor(Sex),
LaurenType = as.factor(LaurenType),
Surgery_type = as.factor(Surgery_type),
cTNM_group = as.factor(cTNM_group),
Morbidities=as.factor(Morbidities),
Localization_number=as.factor(Localization_number),
Years_group = as.factor(Years_group),
cT_group=as.factor(cT_group),
cN_group =as.factor(cN_group),
Chemo_PRE = as.factor(Chemo_PRE),
Surgery_palliative =as.factor(Surgery_palliative),
RessectionType=as.factor(RessectionType),
MacroType=as.factor(MacroType),
PalliativeChemo=as.factor(PalliativeChemo)
)
tbl_uni <- tbl_uvregression(
data = data_model,
method = coxph,
y = Surv(OS, Death),
exponentiate = TRUE
)
tbl_uni_fmt <- tbl_uni %>%
bold_labels() %>%
add_global_p() %>%
modify_header(label = "**Variable**") %>%
modify_spanning_header(
c(estimate, conf.low, conf.high) ~ "**Hazard Ratio (95% CI)**"
)
| Variable | N |
Hazard Ratio (95% CI)
|
p-value | |
|---|---|---|---|---|
| HR | 95% CI | |||
| Localization | 86 | 0.015 | ||
| Antrum | — | — | ||
| Body | 2.30 | 0.87, 6.06 | ||
| Siewert I | 5.07 | 1.90, 13.6 | ||
| Siewert II | 3.07 | 1.13, 8.34 | ||
| Siewert III | 2.18 | 0.70, 6.76 | ||
| Age | 86 | 0.99 | 0.97, 1.01 | 0.3 |
| Years_group | 86 | 0.3 | ||
| 0 | — | — | ||
| 1 | 0.75 | 0.43, 1.31 | ||
| Sex | 86 | 0.2 | ||
| 0 | — | — | ||
| 1 | 1.48 | 0.82, 2.66 | ||
| Morbidities | 85 | 0.2 | ||
| 0 | — | — | ||
| 1 | 1.43 | 0.81, 2.54 | ||
| Localization_number | 86 | 0.015 | ||
| 1 | — | — | ||
| 2 | 0.60 | 0.27, 1.36 | ||
| 3 | 0.43 | 0.16, 1.14 | ||
| 4 | 0.45 | 0.21, 0.99 | ||
| 5 | 0.20 | 0.07, 0.53 | ||
| BMI | 85 | 1.01 | 0.95, 1.06 | 0.8 |
| HB | 85 | 0.94 | 0.81, 1.09 | 0.4 |
| Albumin | 74 | 0.62 | 0.37, 1.06 | 0.089 |
| Neutrophils | 85 | 1.00 | 1.00, 1.00 | 0.3 |
| Lymphocytes | 85 | 1.00 | 1.00, 1.00 | 0.8 |
| NLR | 85 | 1.13 | 0.99, 1.28 | 0.10 |
| ASA | 86 | 2.64 | 1.48, 4.70 | 0.001 |
| cT | 85 | 1.96 | 1.40, 2.73 | <0.001 |
| cT_group | 85 | <0.001 | ||
| 0 | — | — | ||
| 1 | 6.43 | 2.72, 15.2 | ||
| cN | 85 | 1.77 | 1.39, 2.26 | <0.001 |
| cN_group | 85 | 0.002 | ||
| 0 | — | — | ||
| 1 | 2.69 | 1.37, 5.29 | ||
| cM | 86 | 6.05 | 2.89, 12.7 | <0.001 |
| cTNM | 86 | 2.47 | 1.72, 3.54 | <0.001 |
| cTNM_group | 86 | <0.001 | ||
| 0 | — | — | ||
| 1 | 3.11 | 1.36, 7.10 | ||
| 2 | 8.84 | 3.79, 20.6 | ||
| Chemo_PRE | 86 | 0.023 | ||
| 0 | — | — | ||
| 1 | 2.11 | 1.14, 3.88 | ||
| Surgery_type | 86 | <0.001 | ||
| 0 | — | — | ||
| 1 | 0.24 | 0.12, 0.48 | ||
| Surgery_palliative | 86 | <0.001 | ||
| 0 | — | — | ||
| 1 | 4.20 | 2.09, 8.42 | ||
| RessectionType | 86 | <0.001 | ||
| 0 | — | — | ||
| 1 | 3.16 | 1.27, 7.86 | ||
| 2 | 4.21 | 1.65, 10.7 | ||
| 3 | 11.5 | 4.35, 30.5 | ||
| LaurenType | 86 | 0.5 | ||
| 0 | — | — | ||
| 1 | 0.82 | 0.45, 1.51 | ||
| HistoGrade | 86 | 1.07 | 0.61, 1.89 | 0.8 |
| MacroType | 71 | 0.067 | ||
| 0 | — | — | ||
| 1 | 1.95 | 0.92, 4.14 | ||
| Size | 83 | 1.21 | 1.10, 1.32 | <0.001 |
| PalliativeChemo | 86 | <0.001 | ||
| 0 | — | — | ||
| 1 | 3.17 | 1.76, 5.69 | ||
| Abbreviations: CI = Confidence Interval, HR = Hazard Ratio | ||||
In this step, we separated data from housekeeping genes in this analysis: A (ACTB), B (B2M), C(GAPDH), D(GUSB), E(HPRT1), F(RPLP0), and G(TFRC). According with an in-parallel analysis ran in NormFinder through dXpress app, genes ‘A’ (ACTB) and ‘E’ (HPRT1) were the most relevant to normalize data. In this plot, their combination is shown as ‘norm’.
expr_data<-expr_data%>%dplyr::select(-ID)
# Using ggplot2 to generate the boxplot+jitterplot
p<-expr_data%>%
subset(Gene %in% c("A","B","C","D","E","F","G"))%>%
pivot_wider(names_from=Gene,values_from=Value)%>%
mutate(norm=(A+E)/2)%>% #Selected genes using normfinder
pivot_longer(!c(Name),names_to="Gene",values_to="Value")%>%
ggplot(aes(x=Gene,y=Value))+geom_boxplot(outlier.shape = NA)+geom_jitter(size=1,width = .15)+
theme_bw()+ stat_mean_sd_text(y.pos = 25,size=3.5)
# Saving this result
png("Results/Housekeeping Profile.png",width = 6200,height = 2500,res=600)
p
dev.off()
## png
## 2
Herein, we will normalize Ct values using ‘A’ (ACTB) and ‘E’ (HPRT1) genes.
norm_data<-expr_data%>%
pivot_wider(names_from=Gene,values_from=Value)%>%
group_by(Name)%>%
mutate(norm=mean(c(A,E)))%>%
pivot_longer(!c(Name,norm),names_to="Gene",values_to="Value")%>%
group_by(Name)%>%
mutate(Value=norm-Value)%>%
select(-norm)%>%
pivot_wider(names_from=Gene,values_from=Value)%>%
select(-A,-B,-C,-D,-E,-F,-G)
# Setting object as data.frame
data.frame(norm_data)->norm_data
# Configuring Name of samples as rownames
norm_data$Name->rownames(norm_data)
png("Results/heatmap_data_norm.png",width=4000,height = 4000,res=600)
heatmap(as.matrix(norm_data[,-1]))
dev.off()
## png
## 2
At this moment, we have expression data of 84 genes from 86
patients. Patient 3641 has no expression data with passed QC.
Blank cells represent missing values.
Before beginning the unsupervised analysis, we need to reduce the number of missing cases in this dataset. To do this, we initially excluded samples with 20% or more missing cases, then reduced genes or patients with 10% or more missing cases. Finally, we screen the database for genes/patients with complete information.
# Excluding genes/patients with over 20% missing data
names(tail(sort(colSums(is.na(norm_data))/nrow(norm_data)),10))->exclude_20
#exclude_20
norm_data_pca<-norm_data%>%
select(-any_of(exclude_20))
norm_data_pca$Name->rownames(norm_data_pca)
names(tail(sort(colSums(is.na(t(norm_data_pca)))/ncol(norm_data_pca)),3))->exclude_20
#exclude_20
norm_data_pca<-norm_data_pca%>%
filter(!(Name %in% exclude_20))
# Excluding genes/patients with over 10% missing data
names(tail(sort(colSums(is.na(norm_data_pca))/nrow(norm_data_pca)),5))->exclude_10
#exclude_10
norm_data_pca<-norm_data_pca%>%
select(-any_of(exclude_10))
norm_data_pca$Name->rownames(norm_data_pca)
colSums(is.na(t(norm_data_pca)))/ncol(norm_data_pca)->exclude_10
#exclude_10
names(exclude_10[which(exclude_10!=0)])->exclude_10
#exclude_10
norm_data_pca<-norm_data_pca%>%
filter(!(Name %in% exclude_10))
At this moment, we have expression data of 69 genes from 54 patients.
List of genes for this analysis: ABCC1, ABCC3, AKT1, ALDH1A1, ALDH3A1, CASP8, CCND1, CD34, CD44, CDH1, CDH5, CDKN1A, CDKN3, CEACAM6, DNMT1 EGFR, ERCC1, FGFR2, HMOX2, IGF1R, IL18, LEF1, MCM2, MDM2, MLH1, MMP1, MUC1, NFE2L2, NFE2L3, OCLN PARP1, PCNA, PECAM1, S100P, PTEN, STAT3, STAT1, SMYD2, WNT5A, VEGFA, TOP1, TGFB1, FAS, CD40, HTRA2 HIF1A, HES1, GSTP1, GSTM1, BAX, IL1B, TP53, SOD2, SNAI2, RIPK3, ANGPTL4, IL6, KAT6A, KRAS, MYC PI3, RB1, STAT5A, SNAI1, PIK3C3, NOTCH1, SLPI, MYD88, LATS2
heatmap(as.matrix(norm_data_pca[,-1]))
For this part, we seek to eliminate genes with low expression (cutoff: -8) and/or low variability (cutoff: 0.8).
# Filtering genes with low expression (-dCT < -8)
expr_filtered <- norm_data_pca[,-1] %>%
select(where(~ mean(.x, na.rm = TRUE) > -8))
After this first filter, we retrieved data of 69 genes from 54 patients.
# Filtering genes with low variability (sd < 0.8)
expr_filtered <- expr_filtered %>%
select(where(~ sd(.x, na.rm = TRUE) > 0.8))
After this filter, we retrieved data of 43 genes from 54 patients.
# Scaling data and running PCA
expr_scaled <- scale(expr_filtered)
pca_result <- prcomp(expr_scaled, center = TRUE, scale. = TRUE)
# Observing components contribution to variance
p<-fviz_eig(pca_result, addlabels = TRUE,ncp = 20)
Herein, we require 10 components to achieve over 70% of the explained variance.
p
## Warning in geom_bar(stat = "identity", fill = barfill, color = barcolor, :
## Ignoring empty aesthetic: `width`.
Using these 10 components, we retrieved the top 10 relevant genes.
p<-fviz_cos2(pca_result, choice = "var", axes = 1:15,top = 10,
fill = "lightgray", color = "black") +
theme_minimal() +
theme(axis.text.x = element_text(angle=45))
p
Top 10 relevant genes: CD34, CDH1, ALDH3A1, SLPI, IL6, CDH5, CEACAM6, SOD2, PECAM1, IL1B
After that, we plotted all samples according to their values in the first and second components. We also included clinical information to highlight the localization of these samples using ellipses.
# Combining with clinical information
pca_df <- as_tibble(pca_result$x[, 1:2]) %>%
mutate(Name = norm_data_pca$Name)%>%
left_join(clinic_data,'Name')
# Visualizing PCA results with 95% confidence interval for ellipses showing groups
ggplot(pca_df, aes(x = PC1, y = PC2, color = Localization, label = Name)) +
geom_point(size = 3, alpha = 0.8) +
geom_text(vjust = -1.2, size = 2.5, check_overlap = TRUE) +
stat_ellipse(type = "norm", level = 0.95, linetype = "dashed") +
labs(
title = "PCA based on Gene Expression (-dCt)",
x = paste0("PC1 (", scales::percent(summary(pca_result)$importance[2, 1]), " var)"),
y = paste0("PC2 (", scales::percent(summary(pca_result)$importance[2, 2]), " var)")
) +
theme_minimal()
## Warning: The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
set.seed(1234)
#Selecting relevant genes
top_genes <- p$data %>%
arrange(desc(cos2)) %>%
slice_head(n = 10) %>%
pull(name)
# Creating dataframe for heatmap
for_heat <- norm_data_pca %>%
left_join(clinic_data, by = "Name") %>%
select(Name, Localization,
cT,pT,
all_of(top_genes))
df <- for_heat[,-c(1:4)]
row_dend = dendsort(hclust(dist(t(df))))
col_dend = dendsort(hclust(dist(df),"ward.D"))
ha = HeatmapAnnotation(Localization = as.factor(for_heat[,2]),
cT=for_heat[,3],
pT=for_heat[,4],
col = list( Localization = c("Antrum"="yellow",
"Body"="red",
"Siewert I"="blue",
"Siewert II"="green",
"Siewert III"="purple",
" "="grey80"),
cT = colorRamp2(c(min(for_heat[,3], na.rm=TRUE),
max(for_heat[,3], na.rm=TRUE)),
c("white", "#018571")),
pT = colorRamp2(c(min(for_heat[,4], na.rm=TRUE),
max(for_heat[,4], na.rm=TRUE)),
c("white", "#A6611A"))))
p<-Heatmap(t(scale(df)),
name = "Level", #title of legend
column_title = "Samples",
row_title = "Genes",
column_names_gp=gpar(fontsize=7),
row_km=3,
column_km=4,
row_names_gp = gpar(fontsize = 10),
top_annotation = ha,
column_labels = c(rep("",nrow(expr_filtered))) #Removing column (sample) names
)
p
Analysis with data from 86 patients.
set.seed(1234)
# Creting dataframe with clinical data
for_heat <- norm_data %>%
left_join(clinic_data, by = "Name") %>%
select(Name, Localization,
cT,pT,
all_of(top_genes))
# Organizing top annotation
df <- for_heat[,-c(1:4)]
row_dend = dendsort(hclust(dist(t(df))))
col_dend = dendsort(hclust(dist(df),"ward.D"))
ha = HeatmapAnnotation(Localization = as.factor(for_heat[,2]),
cT=for_heat[,3],
pT=for_heat[,4],
col = list( Localization = c("Antrum"="yellow",
"Body"="red",
"Siewert I"="blue",
"Siewert II"="green",
"Siewert III"="purple",
" "="grey80"),
cT = colorRamp2(c(min(for_heat[,3], na.rm=TRUE),
max(for_heat[,3], na.rm=TRUE)),
c("white", "#018571")),
pT = colorRamp2(c(min(for_heat[,4], na.rm=TRUE),
max(for_heat[,4], na.rm=TRUE)),
c("white", "#A6611A"))))
# Plotting Heatmap
p<-Heatmap(t(scale(df)),
name = "Level", #title of legend
column_title = "Samples",
row_title = "Genes",
column_names_gp=gpar(fontsize=7),
row_names_gp = gpar(fontsize = 10),
top_annotation = ha,
column_labels = c(rep("",nrow(norm_data))) #Remove col names
)
# Saving plot
png("Results/Heatmap_Temp_top10.png",width=5500,height = 2200,res=600)
p
dev.off()
## png
## 2
Before beginning the unsupervised analysis, we need to reduce the number of missing cases in this dataset. To do this, we filter for Siewert located samples. Then, we excluded samples with 20% or more missing cases, then reduced genes or patients with 10% or more missing cases. Finally, we screen the database for genes/patients with complete information.
# Filtering Siewert Samples
colnames(norm_data[,-1])->analysis_genes
for_Siewert <- norm_data %>%
left_join(clinic_data, by = "Name") %>%
filter(Localization %in% c('Siewert I',
'Siewert II',
'Siewert III'))%>%
select(Name,
all_of(analysis_genes))
# Excluding genes/patients with over 20% missing data
names(tail(sort(colSums(is.na(for_Siewert))/nrow(for_Siewert)),10))->exclude_20
#exclude_20
norm_data_pca<-for_Siewert%>%
select(-any_of(exclude_20))
norm_data_pca$Name->rownames(norm_data_pca)
names(tail(sort(colSums(is.na(t(norm_data_pca)))/ncol(norm_data_pca)),3))->exclude_20
#exclude_20
norm_data_pca<-norm_data_pca%>%
filter(!(Name %in% exclude_20))
# Excluding genes/patients with over 10% missing data
names(tail(sort(colSums(is.na(norm_data_pca))/nrow(norm_data_pca)),5))->exclude_10
#exclude_10
norm_data_pca<-norm_data_pca%>%
select(-any_of(exclude_10))
norm_data_pca$Name->rownames(norm_data_pca)
colSums(is.na(t(norm_data_pca)))/ncol(norm_data_pca)->exclude_10
#exclude_10
names(exclude_10[which(exclude_10!=0)])->exclude_10
#exclude_10
norm_data_pca<-norm_data_pca%>%
filter(!(Name %in% exclude_10))
At this moment, we have expression data of 69 genes from 34 patients.
List of genes for this analysis: ABCC1, ABCC3, AKT1, ALDH1A1,
ALDH3A1, CASP8, CCND1, CD274, CD34, CD44, CDH1, CDH5, CDKN1A, CDKN3,
CEACAM6 DNMT1, EGFR, ERCC1, FGFR2, HMOX2, IGF1R, IL18, LEF1, MCM2, MDM2,
MLH1, MMP1, MUC1, NFE2L2, NFE2L3 OCLN, PARP1, PCNA, PECAM1, S100P, PTEN,
STAT3, STAT1, SMYD2, WNT5A, VEGFA, TOP1, TGFB1, FAS, CD40 HTRA2, HIF1A,
HES1, GSTP1, GSTM1, BAX, IL1B, TP53, SOD2, SNAI2, RIPK3, ANGPTL4, KAT6A,
KRAS, MYC PI3, RB1, STAT5A, SNAI1, PIK3C3, NOTCH1, SLPI, MYD88,
LATS2
heatmap(as.matrix(norm_data_pca[,-1]))
For this part, we seek to eliminate genes with low expression (cutoff: -8) and/or low variability (cutoff: 0.8).
# Filtering genes with low expression (-dCT < -8)
expr_filtered <- norm_data_pca[,-1] %>%
select(where(~ mean(.x, na.rm = TRUE) > -8))
After this first filter, we retrieved data of 69 genes from 34 patients.
# Filtering genes with low variability (sd < 0.8)
expr_filtered <- expr_filtered %>%
select(where(~ sd(.x, na.rm = TRUE) > 0.8))
After this filter, we retrieved data of 44 genes from 34 patients.
# Scaling data and running PCA
expr_scaled <- scale(expr_filtered)
pca_result <- prcomp(expr_scaled, center = TRUE, scale. = TRUE)
# Observing components contribution to variance
p<-fviz_eig(pca_result, addlabels = TRUE,ncp = 20)
Herein, we require 10 components to achieve over 80% of the explained variance.
p
## Warning in geom_bar(stat = "identity", fill = barfill, color = barcolor, :
## Ignoring empty aesthetic: `width`.
Using these 10 components, we retrieved the top 10 relevant genes.
p<-fviz_cos2(pca_result, choice = "var", axes = 1:15,top = 10,
fill = "lightgray", color = "black") +
theme_minimal() +
theme(axis.text.x = element_text(angle=45))
p
Top 10 relevant genes: PECAM1, NOTCH1, IL1B, CDH5, IL18, CDH1, CD34, PI3, MLH1, OCLN
After that, we plotted all samples according to their values in the first and second components. We also included clinical information to highlight the localization of these samples using ellipses.
# Combining with clinical information
pca_df <- as_tibble(pca_result$x[, 1:2]) %>%
mutate(Name = norm_data_pca$Name)%>%
left_join(clinic_data,'Name')
# Visualizing PCA results with 95% confidence interval for ellipses showing groups
ggplot(pca_df, aes(x = PC1, y = PC2, color = Localization, label = Name)) +
geom_point(size = 3, alpha = 0.8) +
geom_text(vjust = -1.2, size = 2.5, check_overlap = TRUE) +
stat_ellipse(type = "norm", level = 0.95, linetype = "dashed") +
labs(
title = "PCA based on Gene Expression (-dCt)",
x = paste0("PC1 (", scales::percent(summary(pca_result)$importance[2, 1]), " var)"),
y = paste0("PC2 (", scales::percent(summary(pca_result)$importance[2, 2]), " var)")
) +
theme_minimal()
## Warning: The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
set.seed(1234)
#Selecting relevant genes
top_genes <- p$data %>%
arrange(desc(cos2)) %>%
slice_head(n = 10) %>%
pull(name)
# Creating dataframe for heatmap
for_heat <- norm_data_pca %>%
left_join(clinic_data, by = "Name") %>%
select(Name, Localization,
cT,pT,
all_of(top_genes))
df <- for_heat[,-c(1:4)]
row_dend = dendsort(hclust(dist(t(df))))
col_dend = dendsort(hclust(dist(df),"ward.D"))
ha = HeatmapAnnotation(Localization = as.factor(for_heat[,2]),
cT=for_heat[,3],
pT=for_heat[,4],
col = list( Localization = c("Antrum"="yellow",
"Body"="red",
"Siewert I"="blue",
"Siewert II"="green",
"Siewert III"="purple",
" "="grey80"),
cT = colorRamp2(c(min(for_heat[,3], na.rm=TRUE),
max(for_heat[,3], na.rm=TRUE)),
c("white", "#018571")),
pT = colorRamp2(c(min(for_heat[,4], na.rm=TRUE),
max(for_heat[,4], na.rm=TRUE)),
c("white", "#A6611A"))))
p<-Heatmap(t(scale(df)),
name = "Level", #title of legend
column_title = "Samples",
row_title = "Genes",
column_names_gp=gpar(fontsize=7),
row_km=3,
column_km=2,
row_names_gp = gpar(fontsize = 10),
top_annotation = ha,
column_labels = c(rep("",nrow(expr_filtered))) #Removing column (sample) names
)
p
Analysis with data from 45 patients.
set.seed(1234)
# Creting dataframe with clinical data
for_heat <- for_Siewert %>%
left_join(clinic_data, by = "Name") %>%
select(Name, Localization,
cT,pT,
all_of(top_genes))
# Organizing top annotation
df <- for_heat[,-c(1:4)]
row_dend = dendsort(hclust(dist(t(df))))
col_dend = dendsort(hclust(dist(df),"ward.D"))
ha = HeatmapAnnotation(Localization = as.factor(for_heat[,2]),
cT=for_heat[,3],
pT=for_heat[,4],
col = list( Localization = c("Antrum"="yellow",
"Body"="red",
"Siewert I"="blue",
"Siewert II"="green",
"Siewert III"="purple",
" "="grey80"),
cT = colorRamp2(c(min(for_heat[,3], na.rm=TRUE),
max(for_heat[,3], na.rm=TRUE)),
c("white", "#018571")),
pT = colorRamp2(c(min(for_heat[,4], na.rm=TRUE),
max(for_heat[,4], na.rm=TRUE)),
c("white", "#A6611A"))))
# Plotting Heatmap
p<-Heatmap(t(scale(df)),
name = "Level", #title of legend
column_title = "Samples",
row_title = "Genes",
column_names_gp=gpar(fontsize=7),
row_names_gp = gpar(fontsize = 10),
top_annotation = ha,
column_labels = c(rep("",nrow(for_Siewert))) #Remove col names
)
# Saving plot
png("Results/Heatmap_Siewert_Temp_top10.png",width=5500,height = 2200,res=600)
p
dev.off()
## png
## 2
In this topic, we compared the analysis of several groups using Kruskal-Wallis, followed by a post-hoc adjustment (Bonferroni method).
# Combining expression with clinical data
colnames(norm_data[,-1])->analysis_genes
DEA_data <- norm_data %>%
left_join(clinic_data, by = "Name") %>%
select(Name,Localization,
all_of(analysis_genes))
# Estimating p-values
data.frame(gene=NA,
Kruskal_Wallis_p_value=NA,
Par=NA,
bonf_p_value=NA)->final_KW
for (coluna in 3:ncol(DEA_data)){
result_KW<-aov(get(colnames(DEA_data)[coluna]) ~ Localization, data = DEA_data)
result_KW2<-kruskal.test(get(colnames(DEA_data)[coluna]) ~ Localization, data = DEA_data)
if(as.numeric(result_KW2$p.value)<0.05){
PostHocTest(result_KW, method = "bonf")->bonf
data.frame(gene=colnames(DEA_data)[coluna],
Kruskal_Wallis_p_value=as.numeric(result_KW2$p.value),
Par=rownames(bonf$Localization),
bonf_p_value=as.matrix(bonf$Localization[,4]))->temp
rbind(final_KW,temp)->final_KW
}
}
final_KW[-1,]->final_KW
# Formatting data
final_KW<-final_KW%>%pivot_wider(names_from=Par,values_from = bonf_p_value)
# Selecting genes with no pairwise expression differences
final_KW%>%
pivot_longer(!gene,names_to="Comparison",values_to="val")%>%
group_by(gene)%>%
summarize(n=sum(val>0.05))%>%
filter(n==10)%>%
select(gene)%>%
as.matrix()->filter_genes_AOV
# Filtering relevant genes
final_KW%>%
filter(!(gene %in% filter_genes_AOV))->final_KW
# Organizing data to be plotted
final_KW$gene->gene_list
as.matrix(as.data.frame(final_KW[,-c(1,2)],
stringsAsFactors = FALSE))->final_KW
rownames(final_KW)<-gene_list
colnames(final_KW)<-c('Gastric Body vs. Antrum','SWI vs. Gastric Antrum','SWII vs. Gastric Antrum',
'SWIII vs. Gastric Antrum','SWI vs. Gastric Body','SWII vs. Gastric Body',
'SWIII vs. Gastric Body','SWI vs. SWII', 'SWI vs. SWIII', 'SWII vs. SWIII')
final_KW[c('STAT3','PI3','EGFR','VEGFA','SOD2','SLPI',
'NOTCH1','TP63','KRAS','IL18','IGF1R','NFE2L2',
'IL12A','MDM2','TGFB1','AKT1','WNT5A','RB1',
'CEACAM6','MYD88'),
c('SWI vs. SWII','SWI vs. SWIII','SWI vs. Gastric Body','SWI vs. Gastric Antrum',
'SWII vs. SWIII','SWII vs. Gastric Body','SWII vs. Gastric Antrum',
'SWIII vs. Gastric Body','SWIII vs. Gastric Antrum','Gastric Body vs. Antrum' )]->final_KW
# Running a corrplot
corrplot(as.matrix(-1*log10(t(final_KW))),
p.mat = as.matrix(t(final_KW)),insig = "blank",
#cl.lim=c(0,10),
is.corr = FALSE,col = COL2('BrBG', 5),
tl.col = 'black',tl.srt = 60,title = "Kruskal-Wallis post hoc Bonferroni p-values (-log10)",
mar = c(1, 1, 5, 1))
A graph was then created showing the most significant p-values in
each case.
In this analysis, we pooled data from gastric body and antrum samples (B+A) to identify genes differentially expressed in only one of the Siewert categories compared to gastric tumor regions. We then performed pairwise comparisons of each Siewert group with the gastric sections. Then we selected genes that showed significant differences (FC>1.5 & p-value<0.05) in only one of these comparisons (Gastric Body+Antrum vs. SWI | SWII | SWIII).
colnames(norm_data[,-1])->analysis_genes
# Combining clinical and molecular data
DEA_data <- norm_data %>%
left_join(clinic_data, by = "Name") %>%
filter(Name !=760)%>% ## CHECAR cOM GASTRO
select(Name,Localization,
all_of(analysis_genes))
# Setting parameters and thresholds
thresh<-1.5 #Foldchange threshold
Testa<-c("Siewert I","Siewert II","Siewert III") #Categories to be tested
Testa_name<-c("SWI","SWII","SWIII") #Setting labels
rm(final_result)
for(k in 1:3){
for(i in 3:ncol(DEA_data)){
DEA_data%>%
filter(Localization %in% c(Testa[k],"Body","Antrum"))%>%
mutate(Group=ifelse(Localization %in% c("Body","Antrum"),
"Body+Antrum",
Testa_name[k]),
expr=get(colnames(DEA_data)[i]))%>%
select(expr,Group)%>%
as.data.frame()->data_test2
data_test2%>%
group_by(Group)%>%
summarise(x=median(na.omit(expr)))->y
compare_means(formula = expr~ Group,data_test2,method="wilcox.test")->x ## Running Kruskal-Wallis two-groups
if(as.numeric(x$p)<0.05 & abs(y$x[1]-y$x[2])>log2(thresh)){ # Setting thesholds
#print(x)
if(exists("final_result")){
rbind(final_result,data.frame(x,
Gene=colnames(DEA_data)[i],
abs_FC=abs(y$x[1]-y$x[2])))->final_result
}else{
data.frame(x,
Gene=colnames(DEA_data)[i],
abs_FC=abs(y$x[1]-y$x[2]))->final_result
}
}
}
}
#Removing duplicated genes (only those p<0.05 and FC>1.5 in one condition)
final_result$Gene[which(! final_result$Gene %in% names(which(table(final_result$Gene)>1)))]->to_select
# Selecting data to be plotted
norm_data %>%
left_join(clinic_data, by = "Name") %>%
filter(Name !=760)%>% ## CHECAR cOM GASTRO
mutate(Group=ifelse(Localization %in% c("Body","Antrum"),
"Body+Antrum",
Localization))%>%
select(all_of(to_select),Group)->data_teste2
# Adding pairwise p-values to compare (using Body+Antrum as a reference)
rm(p_valores)
for(i in 1:length(to_select)){
data_teste2%>%
mutate(Expr=get(colnames(data_teste2)[i]))->data_teste2
if(exists("p_valores")){
rbind(p_valores,
data.frame(Gene=colnames(data_teste2)[i],
compare_means(Expr~Group,data_teste2,ref.group = "Body+Antrum")))->p_valores
}else{
data.frame(Gene=colnames(data_teste2)[i],
compare_means(Expr~Group,data_teste2,ref.group = "Body+Antrum"))->p_valores
}
}
p_valores%>%
mutate(BUSCA=paste0(group2,Gene))->p_valores
norm_data %>%
left_join(clinic_data, by = "Name") %>%
filter(Name !=760)%>% ## CHECAR cOM GASTRO
mutate(Group=ifelse(Localization %in% c("Body","Antrum"),
"Body+Antrum",
Localization))%>%
select(Name,all_of(to_select),Group)%>%
pivot_longer(!c("Name","Group"),names_to="Genes",values_to="Value")%>%
group_by(Group,Genes)%>%
summarise(Value=median(na.omit(Value)))%>%
group_by(Genes)%>%
mutate(normal_val=ifelse(Group=="Body+Antrum",Value,NA),
normal_val=min(na.omit(normal_val)),
Value=(Value-normal_val))%>%
filter(Group!="Body+Antrum")%>%
mutate(BUSCA=paste0(Group,Genes))%>%
left_join(p_valores,"BUSCA")->resultado_finala
# Plotting the final result
resultado_finala%>%
ggplot(aes(x=factor(Genes,
levels=c(resultado_finala%>%
group_by(Genes)%>%
filter(abs(Value)>log2(thresh))%>%
mutate(val_fil=min(p))%>%
filter(val_fil==p,Group=="Siewert I")%>%
select(Genes)%>%as.matrix(),
resultado_finala%>%
group_by(Genes)%>%
filter(abs(Value)>log2(thresh))%>%
mutate(val_fil=min(p))%>%
filter(val_fil==p,Group=="Siewert II")%>%
select(Genes)%>%as.matrix(),
resultado_finala%>%
group_by(Genes)%>%
filter(abs(Value)>log2(thresh))%>%
mutate(val_fil=min(p))%>%
filter(val_fil==p,Group=="Siewert III")%>%
select(Genes)%>%as.matrix())),
y=factor(Group,
levels=c("Body+Antrum",
"Siewert III","Siewert II",
"Siewert I")),
color=Value,size=-1*(log10(p)),
alpha=ifelse( p<0.05 & (abs(Value)>log2(thresh)),1,.5)))+
geom_point()+
geom_point(data = resultado_finala%>%
filter(p<0.05 & (abs(Value)>log2(thresh))),shape=21,color="black")+
theme_bw()+
theme(legend.title = element_text(size=10),
legend.key.height = unit(0.4, 'cm'),
legend.key.width = unit(0.4, 'cm'),
axis.text.x = element_text(face = "italic"))+
guides(
alpha = guide_legend(override.aes = list(size = 4)))+
ylab("Localization")+xlab("Gene")+
scale_color_gradient2(name=bquote(-Delta~Delta~CT~(vs.B+A)),midpoint=0, low="blue", mid="white",
high="red", space ="Lab" )+
scale_alpha_continuous(name="Result",lim=c(0,1),breaks=c(0.1,1),
labels=c("Non-significant","Significant"))+
scale_size_continuous(name="p-value",
breaks=c(-log10(0.05),-log10(0.01),-log10(0.001)),
labels=c("0.05","0.01","0.001"))+
scale_y_discrete(labels=c("Antrum"="Antrum",
"Body"="Body",
"Siewert I"="EGJA (Siewert I)",
"Siewert II"="EGJA (Siewert II)",
"Siewert III"="EGJA (Siewert III)",
" "="NA"))+
theme(axis.text.x = element_text(angle=45,hjust=1))->p
png(paste0("Results/RESULTADOS_EXP_exclusiva_Siewert_",thresh,"_TODAS AMOSTRAS.png"),
width=5200,height = 2000,res=600)
p
dev.off()
## png
## 2
In this topic, we hypothesize that the SWII region is at the transition between SWI and SWIII. Therefore, we expect to see a gradient from more to less (or vice versa) in the expression levels of certain genes between SWI, SWII, and SWIII. This analysis searches for genes with a significant difference in expression (Kruskal-Wallis, p<0.05) between SWI and SWIII. After, among those preselected genes, the system looks for those where the expression of SWII is in the middle of the other regions. Finally, a heatmap is deployed to show those profiles.
colnames(norm_data[,-1])->analysis_genes
# Combining clinical and molecular data
DEA_data <- norm_data %>%
left_join(clinic_data, by = "Name") %>%
filter(Name !=760)%>% ## CHECAR cOM GASTRO
select(Name,Localization,
all_of(analysis_genes))
# Filtering data from SWI-SWIII
DEA_data%>%
filter(Localization %in% c("Siewert I","Siewert II","Siewert III" ))%>%
pivot_longer(!c("Name","Localization"),names_to="Gene",values_to="Values")->clinicos2
# Evaluating regions with small coefficient of variation (CV)
clinicos2%>%
group_by(Localization,Gene)%>%
summarise(coef=sd(na.omit(2^Values))/mean(na.omit(2^Values)))%>%
filter(coef<1,Localization %in% c("Siewert I","Siewert III" ))%>%
group_by(Localization)%>%
arrange(Localization,coef)->clinicos_coef
clinicos2$Localization<-factor(clinicos2$Localization,levels=c("Siewert III",
"Siewert II",
"Siewert I"))
sort(table(clinicos_coef$Gene),decreasing = T)->clinicos_coef2
names(clinicos_coef2[which(clinicos_coef2>1)])->lista_coef
NA->list_genes_selected
# Evaluating genes with genes differentially expressed between SWI and SWIII
# Then, evaluating genes whose SWII are in the middle of SWI and SWIII
for(i in 1:length(lista_coef)){
wilcox.test(Values~Localization,
subset(clinicos2,Gene==lista_coef[i] & Localization %in% c("Siewert I",
"Siewert III" )))->Wilcoxt
if( as.numeric(Wilcoxt$p.value)<0.05){
clinicos2%>%
filter(Gene==lista_coef[i] & Localization %in% c("Siewert I",
"Siewert III" ))%>%
group_by(Localization)%>%
summarise(min=min(na.omit(Values)),
max=max(na.omit(Values)),
median=median(na.omit(Values)))->S1_3
clinicos2%>%
filter(Gene==lista_coef[i] & Localization %in% c("Siewert II" ))%>%
summarise(min=min(na.omit(Values)),
max=max(na.omit(Values)),
median=median(na.omit(Values)))->S2
if( between(S2$median,min(S1_3$median[1],S1_3$median[2]),max(S1_3$median[1],S1_3$median[2]))){
c(list_genes_selected,lista_coef[i])->list_genes_selected
p <- clinicos2 %>%
filter(Gene == lista_coef[i]) %>%
ggplot(aes(Values, Localization)) +
geom_boxplot(outlier.shape = NA) +
geom_jitter(alpha = .6, height = .15) +
theme_bw() +
ylab("Localization") +
xlab(paste0(lista_coef[i], " levels (-dCT)"))
ggsave(
filename = paste0("Results/Transition Siewert/", lista_coef[i], ".png"),
plot = p,
width = 5,
height = 2,
dpi = 600
)
}
}
}
## systemfonts and textshaping have been compiled with different versions of Freetype. Because of this, textshaping will not use the font cache provided by systemfonts
# Selecting data for relevant genes
clinicos3<-DEA_data%>%
filter(Localization %in% c("Siewert I","Siewert II","Siewert III" ))%>%
select(Name,Localization,
all_of(list_genes_selected[-1]))
# Plotting the heatmap
df <- scale(clinicos3[,-c(1:2)])
row_dend = dendsort(hclust(dist(t(df))))
col_dend = dendsort(hclust(dist(df)))
ha = HeatmapAnnotation(Localization = as.factor(clinicos3[,2]),
col = list( Localization = c("Antrum"="yellow",
"Body"="red",
"Siewert I"="blue",
"Siewert II"="green",
"Siewert III"="purple",
" "="grey")))
p<-Heatmap(t(df),
name = "Expression", #title of legend
column_title = "Samples",
row_title = "Genes",
column_names_gp=gpar(fontsize=7),
row_dend_reorder = TRUE,
cluster_column_slices = FALSE,
top_annotation = ha,
column_split = factor(clinicos3[,2],levels=c("Siewert I",
"Siewert II",
"Siewert III" ))
)
png("Results/Heatmap_SiewertII.png",width = 4000,height = 2000,res=600)
p
dev.off()
## png
## 2
Then, we can visualize individual boxplots per selected genes:
slickR(list.files("Results/Transition Siewert", pattern="png", full.names=TRUE))
Saving Session Info and R-env…