gene_x 0 like s 466 view s
Tags: processing, pipeline
preparing gene expression matrix: calculate DESeq2 results
#Input: merged_gene_counts.txt
setwd("/home/jhuang/DATA/Data_Susanne_Carotis_RNASeq/run_2023_GSVA_table/")
library("AnnotationDbi")
library("clusterProfiler")
library("ReactomePA")
#BiocManager::install("org.Hs.eg.db")
library("org.Hs.eg.db")
library(DESeq2)
library(gplots)
d.raw<- read.delim2("merged_gene_counts.txt",sep="\t", header=TRUE, row.names=1)
colnames(d.raw)<-c("gene_name", "leer_mock_2h_r2", "Ace2_mock_2h_r2", "leer_inf_24h_r1", "Ace2_inf_2h_r1", "leer_inf_24h_r2", "leer_inf_2h_r1", "leer_mock_2h_r1", "leer_inf_2h_r2", "Ace2_inf_2h_r2", "Ace2_mock_2h_r1", "Ace2_inf_24h_r2", "Ace2_inf_24h_r1")
col_order <- c("gene_name", "leer_mock_2h_r1","leer_mock_2h_r2","leer_inf_2h_r1","leer_inf_2h_r2","leer_inf_24h_r1","leer_inf_24h_r2","Ace2_mock_2h_r1","Ace2_mock_2h_r2","Ace2_inf_2h_r1","Ace2_inf_2h_r2","Ace2_inf_24h_r1","Ace2_inf_24h_r2")
reordered.raw <- d.raw[,col_order]
reordered.raw$gene_name <- NULL
#d <- d.raw[rowSums(reordered.raw>3)>2,]
condition = as.factor(c("leer_mock_2h","leer_mock_2h","leer_inf_2h","leer_inf_2h","leer_inf_24h","leer_inf_24h","Ace2_mock_2h","Ace2_mock_2h","Ace2_inf_2h","Ace2_inf_2h","Ace2_inf_24h","Ace2_inf_24h"))
ids = as.factor(c("leer_mock_2h_r1","leer_mock_2h_r2","leer_inf_2h_r1","leer_inf_2h_r2","leer_inf_24h_r1","leer_inf_24h_r2","Ace2_mock_2h_r1","Ace2_mock_2h_r2","Ace2_inf_2h_r1","Ace2_inf_2h_r2","Ace2_inf_24h_r1","Ace2_inf_24h_r2"))
#cData = data.frame(row.names=colnames(reordered.raw), condition=condition, batch=batch, ids=ids)
#dds<-DESeqDataSetFromMatrix(countData=reordered.raw, colData=cData, design=~batch+condition)
cData = data.frame(row.names=colnames(reordered.raw), condition=condition, ids=ids)
dds<-DESeqDataSetFromMatrix(countData=reordered.raw, colData=cData, design=~condition)
#----more detailed and specific with the following code!----
dds$condition <- relevel(dds$condition, "Ace2_mock_2h")
dds = DESeq(dds, betaPrior=FALSE) # betaPrior default value is FALSE
resultsNames(dds)
preparing selected_geneSets in gsva(exprs, selected_geneSets, method="gsva"). Note that methods are different than methods for nanoString, here are ENSEMBL listed.
#Input: "Signatures.xls" + "Signatures_additional.xls"
library(readxl)
library(gridExtra)
library(ggplot2)
library(GSVA)
# Paths to the Excel files
file_paths <- list("Signatures.xls", "Signatures_additional.xls")
# Get sheet names for each file
sheet_names_list <- lapply(file_paths, excel_sheets)
# Initialize an empty list to hold gene sets
geneSets <- list()
# Loop over each file path and its corresponding sheet names
for (i in 1:length(file_paths)) {
file_path <- file_paths[[i]]
sheet_names <- sheet_names_list[[i]]
# Loop over each sheet, extract the ENSEMBL IDs, and add to the list
for (sheet in sheet_names) {
# Read the sheet
data <- read_excel(file_path, sheet = sheet)
# Process the GeneSet names (replacing spaces with underscores, for example)
gene_set_name <- gsub(" ", "_", unique(data$GeneSet)[1])
# Add ENSEMBL IDs to the list
geneSets[[gene_set_name]] <- as.character(data$ENSEMBL)
}
}
# Print the result to check
#print(geneSets)
#summary(geneSets)
##desired_geneSets <- c("Monocytes", "Plasma_cells", "T_regs", "Cyt._act._T_cells", "Neutrophils", "Inflammatory_neutrophils", "Suppressive_neutrophils", "LDG", "CD40_activated")
#desired_geneSets <- c("IFN", "TNF", "IL-6R_complex", "IL-1_cytokines", "Pro-inflam._IL-1", "Monocyte_secreted", "Apoptosis", "NFkB_complex", "NLRP3_inflammasome")
#selected_geneSets <- geneSets[desired_geneSets]
selected_geneSets <- geneSets
# Print the selected gene sets
print(selected_geneSets)
prepare violin plots and p-value table for RNA-seq data
# 0. for RNAseq, the GSVA input requires a gene expression matrix where rows are genes and columns are samples. This matrix must be in non-log space.
exprs <- counts(dds, normalized=TRUE)
# 1. Compute GSVA scores:
gsva_scores <- gsva(exprs, selected_geneSets, method="gsva")
# 2. Convert to data.frame for ggplot:
gsva_df <- as.data.frame(t(gsva_scores))
# 3. Add conditions to gsva_df:
gsva_df$Condition <- dds$condition
# 4. Filter the gsva_df to retain only the desired conditions:
#group 1 vs. group 3 in the nanostring data
gsva_df_filtered <- gsva_df[gsva_df$Condition %in% c("Ace2_mock_2h", "Ace2_inf_24h"), ]
# 5. Define a function to plot violin plots:
# Update the condition levels in gsva_df_filtered to ensure the desired order on x-axis:
gsva_df_filtered$Condition <- gsub("Ace2_mock_2h", "Group3", gsva_df_filtered$Condition) #group3=mock
gsva_df_filtered$Condition <- gsub("Ace2_inf_24h", "Group1a", gsva_df_filtered$Condition) #group1a=infection
gsva_df_filtered$Condition <- factor(gsva_df_filtered$Condition, levels = c("Group1a", "Group3"))
#plot_violin <- function(data, gene_name) {
# # Calculate the t-test p-value for the two conditions
# condition1_data <- data[data$Condition == "Group1a", gene_name]
# condition2_data <- data[data$Condition == "Group3", gene_name]
# p_value <- t.test(condition1_data, condition2_data)$p.value
# # Convert p-value to annotation
# p_annotation <- ifelse(p_value < 0.01, "**", ifelse(p_value < 0.05, "*", ""))
# rounded_p_value <- paste0("p = ", round(p_value, 2))
# plot_title <- gsub("_", " ", gene_name)
# p <- ggplot(data, aes(x=Condition, y=!!sym(gene_name), fill=Condition)) +
# geom_violin(linewidth=1.2) +
# scale_fill_manual(values = custom_colors) +
# labs(title=plot_title, y="GSVA Score") +
# ylim(-1, 1) +
# theme_light() +
# theme(
# axis.title.x = element_text(size=12),
# axis.title.y = element_text(size=12),
# axis.text.x = element_text(size=10),
# axis.text.y = element_text(size=10),
# plot.title = element_text(size=12, hjust=0.5),
# legend.position = "none" # Hide legend since the colors are self-explanatory
# )
# # Add p-value annotation to the plot
# p <- p + annotate("text", x=1.5, y=0.9, label=paste0(p_annotation, " ", rounded_p_value), size=5, hjust=0.5)
# return(p)
#}
## 6. Generate the list of plots in a predefined order:
#genes <- colnames(gsva_df_filtered)[!colnames(gsva_df_filtered) %in% "Condition"]
#genes <- genes[match(desired_order, genes)]
#genes <- genes[!is.na(genes)]
#first_row_plots <- lapply(genes, function(gene) plot_violin(gsva_df_filtered, gene))
# -- This following code does not have the colors in the figure --
plot_violin <- function(data, gene_name) {
# Calculate the t-test p-value for the two conditions
condition1_data <- data[data$Condition == "Group1a", gene_name]
condition2_data <- data[data$Condition == "Group3", gene_name]
p_value <- t.test(condition1_data, condition2_data)$p.value
# Convert p-value to annotation
p_annotation <- ifelse(p_value < 0.01, "**", ifelse(p_value < 0.05, "*", ""))
rounded_p_value <- paste0("p = ", round(p_value, 2))
plot_title <- gsub("_", " ", gene_name)
p <- ggplot(data, aes(x=Condition, y=!!sym(gene_name))) +
geom_violin(linewidth=1.2) +
labs(title=plot_title, y="GSVA Score") +
ylim(-1, 1) +
theme_light() +
theme(
axis.title.x = element_text(size=12),
axis.title.y = element_text(size=12),
axis.text.x = element_text(size=10),
axis.text.y = element_text(size=10),
plot.title = element_text(size=12, hjust=0.5)
)
# Add p-value annotation to the plot
p <- p + annotate("text", x=1.5, y=0.9, label=paste0(p_annotation, " ", rounded_p_value), size=5, hjust=0.5)
#return(p)
# Return both the plot and the p-value
list(plot = p, p_value = p_value)
}
# 6. Generate the list of plots in a predefined order:
#desired_order <- c("IFN", "TNF", "IL-6R_complex", "IL-1_cytokines", "Pro-inflam._IL-1", "Monocyte_secreted", "Apoptosis", "NFkB_complex", "NLRP3_inflammasome")
genes <- colnames(gsva_df_filtered)[!colnames(gsva_df_filtered) %in% "Condition"]
#genes <- genes[match(desired_order, genes)]
#first_row_plots <- lapply(genes, function(gene) plot_violin(gsva_df_filtered, gene))
# Correct the creation of p_values_df
p_values_list <- lapply(genes, function(gene) plot_violin(gsva_df_filtered, gene)$p_value)
p_values_df <- data.frame(Gene = genes, P_Value = unlist(p_values_list))
# Calculate adjusted p-values
#p_values_df$Adjusted_P_Value <- p.adjust(p_values_df$P_Value, method = "fdr")
write.xlsx(p_values_df, "p_values_df.xlsx")
5.1. preparing selected_geneSets in gsva(exprs, selected_geneSets, method="gsva") for NanoString
#Input: Signatures.xls
library(readxl)
library(gridExtra)
library(ggplot2)
library(GSVA)
# Paths to the Excel files
file_paths <- list("Signatures.xls", "Signatures_additional.xls")
# Get sheet names for each file
sheet_names_list <- lapply(file_paths, excel_sheets)
# Initialize an empty list to hold gene sets
geneSets <- list()
# Loop over each file path and its corresponding sheet names
for (i in 1:length(file_paths)) {
file_path <- file_paths[[i]]
sheet_names <- sheet_names_list[[i]]
# Loop over each sheet, extract the ENSEMBL IDs, and add to the list
for (sheet in sheet_names) {
# Read the sheet
data <- read_excel(file_path, sheet = sheet)
# Process the GeneSet names (replacing spaces with underscores, for example)
gene_set_name <- gsub(" ", "_", unique(data$GeneSet)[1])
# Add ENSEMBL IDs to the list
geneSets[[gene_set_name]] <- unique(as.character(data$geneSymbol))
}
}
# Print the result to check
summary(geneSets)
#desired_geneSets <- c("Monocytes", "Plasma_cells", "T_regs", "Cyt._act._T_cells", "Neutrophils", "Inflammatory_neutrophils", "Suppressive_neutrophils", "LDG", "CD40_activated")
desired_geneSets <- c("IFN", "TNF", "IL-6R_complex", "IL-1_cytokines", "Pro-inflam._IL-1", "Monocyte_secreted", "Apoptosis", "NFkB_complex", "NLRP3_inflammasome", "T_cells", "Monocytes","Plasma_cells","T_regs","Cyt._act._T_cells","Neutrophils","Inflammatory_neutrophils","Suppressive_neutrophils","LDG","CD40_activated")
selected_geneSets <- geneSets[desired_geneSets]
# Print the selected gene sets
print(selected_geneSets)
5.2. prepare violin plots and p-value table for NanoString data (Group 3 vs Group 1)
#library(rmarkdown); render("run.Rmd", c("html_document")) under jhuang@hamburg:~/DATA/Data_Susanne_Carotis_spatialRNA_PUBLISHING/run_2023_2_GSVA_table with R
# 0. for Nanostring, the GSVA input requires a gene expression matrix 'exprs' where rows are genes and columns are samples. This matrix must be in non-log space.
exprs <- exprs(target_m666Data) #18677 45
#exprs <- exprs(filtered_or_neg_target_m666Data) #2274 45
# 1. Compute GSVA scores:
gsva_scores <- gsva(exprs, selected_geneSets, method="gsva")
# 2. Convert to data.frame for ggplot:
gsva_df <- as.data.frame(t(gsva_scores))
# 3. Add conditions to gsva_df:
identical(rownames(pData(target_m666Data)), rownames(gsva_df))
gsva_df$Condition <- pData(target_m666Data)$Grp
#identical(rownames(gsva_df_filtered), rownames(pData(target_m666Data)) )
gsva_df$SampleID <- pData(target_m666Data)$SampleID
# 4. Filter the gsva_df to retain only the desired conditions:
#group 1 vs. group 3 in the nanostring data
gsva_df_filtered <- gsva_df[gsva_df$Condition %in% c("1", "3"), ]
# 5. Define a function to plot violin plots:
# Define custom colors
custom_colors <- c("Group1" = "lightblue", "Group1a" = "red", "Group3" = "grey")
#To implement the custom colors, and make the adjustments to abbreviate "Inflammatory" and "Suppressive", as well as increase the font size for the groups on the x-axis, we can modify the plot_violin function as follows:
gsva_df_filtered$Condition <- gsub("1", "Group1", gsva_df_filtered$Condition)
gsva_df_filtered$Condition <- gsub("3", "Group3", gsva_df_filtered$Condition)
gsva_df_filtered$Condition <- factor(gsva_df_filtered$Condition, levels = c("Group1", "Group3"))
plot_violin <- function(data, gene_name) {
# Calculate the t-test p-value for the two conditions
condition1_data <- data[data$Condition == "Group1", gene_name]
condition2_data <- data[data$Condition == "Group3", gene_name]
p_value <- t.test(condition1_data, condition2_data)$p.value
# Convert p-value to annotation
p_annotation <- ifelse(p_value < 0.01, "**", ifelse(p_value < 0.05, "*", ""))
rounded_p_value <- paste0("p = ", round(p_value, 2))
plot_title <- gsub("_", " ", gene_name)
p <- ggplot(data, aes(x=Condition, y=!!sym(gene_name), fill=Condition)) +
geom_violin(linewidth=1.2) +
scale_fill_manual(values = custom_colors) +
labs(title=plot_title, y="GSVA Score") +
ylim(-1, 1) +
theme_light() +
theme(
axis.title.x = element_text(size=12),
axis.title.y = element_text(size=12),
axis.text.x = element_text(size=10),
axis.text.y = element_text(size=10),
plot.title = element_text(size=12, hjust=0.5),
legend.position = "none" # Hide legend since the colors are self-explanatory
)
# Add p-value annotation to the plot
p <- p + annotate("text", x=1.5, y=0.9, label=paste0(p_annotation, " ", rounded_p_value), size=5, hjust=0.5)
#return(p)
# Return both the plot and the p-value
list(plot = p, p_value = p_value)
}
library(openxlsx)
desired_order <- c("IFN", "TNF", "IL-6R_complex", "IL-1_cytokines", "Pro-inflam._IL-1", "Monocyte_secreted", "Apoptosis", "NFkB_complex", "NLRP3_inflammasome", "T_cells", "Monocytes","Plasma_cells","T_regs","Cyt._act._T_cells","Neutrophils","Inflammatory_neutrophils","Suppressive_neutrophils","LDG","CD40_activated")
genes <- colnames(gsva_df_filtered)[!colnames(gsva_df_filtered) %in% "Condition"]
genes <- genes[match(desired_order, genes)]
genes <- genes[!is.na(genes)]
#second_row_plots <- lapply(genes, function(gene) plot_violin(gsva_df_filtered, gene))
# Correct the creation of p_values_df
p_values_list <- lapply(genes, function(gene) plot_violin(gsva_df_filtered, gene)$p_value)
p_values_df <- data.frame(Gene = genes, P_Value = unlist(p_values_list))
write.xlsx(p_values_df, "p_values_df_Group3_vs_Group1_19sig.xlsx")
5.3. prepare violin plots and p-value table for NanoString data (Group 3 vs Group 1a)
# 2. Convert to data.frame for ggplot:
gsva_df <- as.data.frame(t(gsva_scores))
# 3. Add conditions to gsva_df:
identical(rownames(pData(target_m666Data)), rownames(gsva_df))
gsva_df$Condition <- pData(target_m666Data)$Group
#identical(rownames(gsva_df_filtered), rownames(pData(target_m666Data)) )
gsva_df$SampleID <- pData(target_m666Data)$SampleID
# 4. Filter the gsva_df to retain only the desired conditions:
#group 1 vs. group 3 in the nanostring data
gsva_df_filtered <- gsva_df[gsva_df$Condition %in% c("1a", "3"), ]
# 5. Define a function to plot violin plots:
# Update the condition levels in gsva_df_filtered to ensure the desired order on x-axis:
gsva_df_filtered$Condition <- gsub("1a", "Group1a", gsva_df_filtered$Condition)
gsva_df_filtered$Condition <- gsub("3", "Group3", gsva_df_filtered$Condition)
gsva_df_filtered$Condition <- factor(gsva_df_filtered$Condition, levels = c("Group1a", "Group3"))
plot_violin <- function(data, gene_name) {
# Calculate the t-test p-value for the two conditions
condition1_data <- data[data$Condition == "Group1a", gene_name]
condition2_data <- data[data$Condition == "Group3", gene_name]
p_value <- t.test(condition1_data, condition2_data)$p.value
# Convert p-value to annotation
p_annotation <- ifelse(p_value < 0.01, "**", ifelse(p_value < 0.05, "*", ""))
rounded_p_value <- paste0("p = ", round(p_value, 2))
plot_title <- gsub("_", " ", gene_name)
p <- ggplot(data, aes(x=Condition, y=!!sym(gene_name), fill=Condition)) +
geom_violin(linewidth=1.2) +
scale_fill_manual(values = custom_colors) +
labs(title=plot_title, y="GSVA Score") +
ylim(-1, 1) +
theme_light() +
theme(
axis.title.x = element_text(size=12),
axis.title.y = element_text(size=12),
axis.text.x = element_text(size=10),
axis.text.y = element_text(size=10),
plot.title = element_text(size=12, hjust=0.5),
legend.position = "none" # Hide legend since the colors are self-explanatory
)
# Add p-value annotation to the plot
p <- p + annotate("text", x=1.5, y=0.9, label=paste0(p_annotation, " ", rounded_p_value), size=5, hjust=0.5)
#return(p)
# Return both the plot and the p-value
list(plot = p, p_value = p_value)
}
# 6. Generate the list of plots in a predefined order:
genes <- colnames(gsva_df_filtered)[!colnames(gsva_df_filtered) %in% "Condition"]
genes <- genes[match(desired_order, genes)]
genes <- genes[!is.na(genes)]
# Correct the creation of p_values_df
p_values_list <- lapply(genes, function(gene) plot_violin(gsva_df_filtered, gene)$p_value)
p_values_df <- data.frame(Gene = genes, P_Value = unlist(p_values_list))
write.xlsx(p_values_df, "p_values_df_Group3_vs_Group1a_19sig.xlsx")
点赞本文的读者
还没有人对此文章表态
没有评论
RNA-seq Tam on CP059040.1 (Acinetobacter baumannii strain ATCC 19606)
Variant Calling for Herpes Simplex Virus 1 from Patient Sample Using Capture Probe Sequencing
Typing of 81 S. epidermidis samples (Luise)
Co-Authorship Network Generator using scraped data from Google Scholar via SerpAPI
© 2023 XGenes.com Impressum