gene_x 0 like s 528 view s
Tags: research
There are several R packages available for performing Gene Ontology (GO) enrichment analysis in non-model species. Some of these packages are:
Input for R-script
GO enrichment using topGO
# prepare the *_noFPC.csv files
cut -d$'\t' -f1 blast2go_table_5_noFPC.csv > f1
cut -d$'\t' -f8 blast2go_table_5_noFPC.csv > f8 #replace "; " to ","
# remove the first line!
# Load necessary libraries
library(topGO)
library(data.table) # For reading and handling data efficiently
# List of DEG files (full paths)
deg_files <- c(
"results_1585/star_salmon/15m_vs_IOM_OD0.5.4h-up.txt", "results_1585/star_salmon/1h_vs_IOM_OD0.5.4h-up.txt",
"results_1585/star_salmon/2h_vs_IOM_OD0.5.4h-up.txt", "results_1585/star_salmon/4h_vs_IOM_OD0.5.4h-up.txt",
"results_1585/star_salmon/6h_vs_IOM_OD0.5.4h-up.txt", "results_1585/star_salmon/4d_vs_IOM_OD0.5.4h-up.txt",
"results_1585/star_salmon/15m_vs_IOM_OD0.5.4h-down.txt", "results_1585/star_salmon/1h_vs_IOM_OD0.5.4h-down.txt",
"results_1585/star_salmon/2h_vs_IOM_OD0.5.4h-down.txt", "results_1585/star_salmon/4h_vs_IOM_OD0.5.4h-down.txt",
"results_1585/star_salmon/6h_vs_IOM_OD0.5.4h-down.txt", "results_1585/star_salmon/4d_vs_IOM_OD0.5.4h-down.txt",
"results_1585/star_salmon/15m_vs_IOM_OD0.5-up.txt", "results_1585/star_salmon/1h_vs_IOM_OD0.5-up.txt",
"results_1585/star_salmon/2h_vs_IOM_OD0.5-up.txt", "results_1585/star_salmon/4h_vs_IOM_OD0.5-up.txt",
"results_1585/star_salmon/6h_vs_IOM_OD0.5-up.txt", "results_1585/star_salmon/4d_vs_IOM_OD0.5-up.txt",
"results_1585/star_salmon/15m_vs_IOM_OD0.5-down.txt", "results_1585/star_salmon/1h_vs_IOM_OD0.5-down.txt",
"results_1585/star_salmon/2h_vs_IOM_OD0.5-down.txt", "results_1585/star_salmon/4h_vs_IOM_OD0.5-down.txt",
"results_1585/star_salmon/6h_vs_IOM_OD0.5-down.txt", "results_1585/star_salmon/4d_vs_IOM_OD0.5-down.txt"
)
# Read your geneID to GO mapping
geneID2GO <- readMappings("blast2go_table_5_noFPC_.csv") # Adjust this to your actual gene-to-GO mapping file
# Define GO analysis parameters
p_value_threshold <- 0.01
ontologies <- c("BP", "MF", "CC")
# Loop through each file and perform GO analysis for each ontology
for (file in deg_files) {
# Read DEG file
deg_data <- read.table(file, sep=",", header=1, row.names=1)
myInterestingGenes <- row.names(deg_data)
# Prepare geneList
geneNames <- names(geneID2GO)
geneList <- factor(as.integer(geneNames %in% myInterestingGenes), levels = c(0, 1))
names(geneList) <- geneNames
# Print geneList for debugging
print(paste("Gene list for file:", file))
print(table(geneList)) # This shows the distribution of 0s and 1s
# Perform GO analysis for each ontology
for (onto in ontologies) {
# Create topGOdata object
GOdata <- new("topGOdata",
ontology = onto,
allGenes = geneList,
annot = annFUN.gene2GO,
gene2GO = geneID2GO)
# Run the analysis
resultTopGO <- runTest(GOdata, algorithm = "elim", statistic = "Fisher")
# Generate table with all results to get the number of significant terms
allRes <- GenTable(GOdata, elimKS = resultTopGO, orderBy = "elimKS", topNodes = 150)
significant <- allRes[as.numeric(allRes$elimKS) < p_value_threshold, ]
## Determine the number of top nodes based on the number of significant terms found
#numSignificant <- nrow(significant)
#numTopNodes <- min(numSignificant, 356) # Assuming 356 as a default value for top nodes based on your dataset
# Generate the final results table with the appropriate number of top nodes
#finalRes <- if (numTopNodes > 0) GenTable(GOdata, elimKS = resultTopGO, orderBy = "elimKS", topNodes = numTopNodes) else significant
# Write significant results
output_filename <- sprintf("topGO_results_%s_%s.txt", gsub("results_1585/star_salmon/", "", gsub("\\.txt", "", file)), onto)
write.table(significant, file = output_filename, sep = "\t", quote = FALSE, col.names = TRUE, row.names = FALSE)
}
}
#TODO: update the Term containing ... by searching them in quickgo, or next time doing it if necessary for publication!
~/Tools/csv2xls-0.4/csv_to_xls.py *_BP.txt -d$'\t' -o GO_enrichment_BP_p0.01.xls;
~/Tools/csv2xls-0.4/csv_to_xls.py *_MF.txt -d$'\t' -o GO_enrichment_MF_p0.01.xls;
~/Tools/csv2xls-0.4/csv_to_xls.py *_CC.txt -d$'\t' -o GO_enrichment_CC_p0.01.xls;
# ---- seperate of BP, MF and CC (Deprecated) ----
> https://gist.github.com/slavailn/dcff753cc32fd9e053590308c831b057 : topGO_nonModel_organism.R
library(dplyr)
library(ggplot2)
library(reshape2)
library(topGO)
library(plyr)
# Read in some table with gene ids, for example expressions
coreGenes <- read.table("./results_1585/star_salmon/15m_vs_IOM_OD0.5.4h-up.txt", sep=",", header=1, row.names=1)
# Get gene mappings to GO terms
# This table has to look like this:
# FN03409 GO:0009514,GO:0004451,GO:0046872,GO:0006097,GO:0006099,
# FN23295 GO:0009507,GO:0022626,GO:0009514,GO:0005739,
# FN06463 GO:0009514,GO:0004474,GO:0006097,GO:0006099,
# FN16456 GO:0048046,GO:0009507,GO:0009570,GO:0005829,GO:0016020,GO:0005777,
# FN00210 GO:0005576,GO:0003993,GO:0046872,
# FN00543 GO:0005788,GO:0003756,GO:0045454,
# FN26703 GO:0005829,GO:0016020,GO:0005730,GO:0005634,GO:0005524,GO:0004612,
# FN16210 GO:0005829,GO:0016020,GO:0005739,GO:0005634,GO:0005886,GO:0009506,
# FN20048 GO:0009507,GO:0016020,GO:0005777,GO:0009536,GO:0004069,GO:0080130,
# FN26629 GO:0022625,GO:0003723,GO:0003735,GO:0000027,GO:0006412,
geneID2GO <- readMappings("blast2go_table_5_noFPC_.csv")
geneNames <- names(geneID2GO)
# Get the list of genes of interest
myInterestingGenes <- row.names(coreGenes) #coreGenes$id
geneList <- factor(as.integer(geneNames %in% myInterestingGenes))
names(geneList) <- geneNames
head(geneList)
#GOdata <- new("topGOdata", ontology = "BP", allGenes = geneList,
# annot = annFUN.gene2GO, gene2GO = geneID2GO)
# # Run topGO with elimination test
# resultTopGO.elim <- runTest(GOdata, algorithm = "elim", statistic = "Fisher" )
# allRes <- GenTable(GOdata, elimKS = resultTopGO.elim,
# orderBy = "elimKS",
# topNodes = 356)
# write.table(allRes, file = "topGO_results.txt", sep = "\t", quote = F, col.names = T, row.names = F)
# #sig.tab <- GenTable(GOdata, Fis = resultFisher, KS = resultKS, orderBy = "KS", ranksOf = "Fis", topNodes = 20)
# #write.table(sig.tab, file = "topGO_results.txt", sep = "\t", quote = F, col.names = T, row.names = F)
# Run for Biological Process (BP)
GOdata_BP <- new("topGOdata",
ontology = "BP",
allGenes = geneList,
annot = annFUN.gene2GO,
gene2GO = geneID2GO)
resultTopGO.elim_BP <- runTest(GOdata_BP, algorithm = "elim", statistic = "Fisher" )
allRes_BP <- GenTable(GOdata_BP, elimKS = resultTopGO.elim_BP,
orderBy = "elimKS", topNodes = 356)
# Filter results based on p-value threshold
p_value_threshold <- 0.05
significant_BP <- allRes_BP[as.numeric(allRes_BP$elimKS) < p_value_threshold, ]
# Write significant results to file
write.table(significant_BP, file = "topGO_results_BP.txt", sep = "\t", quote = FALSE, col.names = TRUE, row.names = FALSE)
# Run for Molecular Function (MF)
GOdata_MF <- new("topGOdata",
ontology = "MF",
allGenes = geneList,
annot = annFUN.gene2GO,
gene2GO = geneID2GO)
resultTopGO.elim_MF <- runTest(GOdata_MF, algorithm = "elim", statistic = "Fisher" )
allRes_MF <- GenTable(GOdata_MF, elimKS = resultTopGO.elim_MF,
orderBy = "elimKS", topNodes = 356)
# Filter results based on p-value threshold
significant_MF <- allRes_MF[as.numeric(allRes_MF$elimKS) < p_value_threshold, ]
# Write significant results to file
write.table(significant_MF, file = "topGO_results_MF.txt", sep = "\t", quote = FALSE, col.names = TRUE, row.names = FALSE)
# Run for Cellular Component (CC)
GOdata_CC <- new("topGOdata",
ontology = "CC",
allGenes = geneList,
annot = annFUN.gene2GO,
gene2GO = geneID2GO)
resultTopGO.elim_CC <- runTest(GOdata_CC, algorithm = "elim", statistic = "Fisher" )
options(width = 200) # Adjust the display width
allRes_CC <- GenTable(GOdata_CC, elimKS = resultTopGO.elim_CC,
orderBy = "elimKS", topNodes = 20) #, topNodes = 356
# Filter results based on p-value threshold
significant_CC <- allRes_CC[as.numeric(allRes_CC$elimKS) < p_value_threshold, ]
# Write significant results to file
write.table(significant_CC, file = "topGO_results_CC.txt", sep = "\t", quote = FALSE, col.names = TRUE, row.names = FALSE)
#DEBUG: write the complete Term-Text by manually searching the GO in https://www.ebi.ac.uk/QuickGO/term/GO:0009331
The options of runTest: algorithm and statistic whichAlgorithms()
Classic Algorithm (algorithm = "classic"): This algorithm is based on the classic Fisher's exact test for enrichment analysis. It calculates enrichment based on the hypergeometric distribution. It is suitable for basic enrichment analysis and is widely used.
Elim Algorithm (algorithm = "elim"): The Elim algorithm is an extension of the classic algorithm. It eliminates genes from higher-order GO categories if they are already significant due to enrichment in a lower-order category. This helps to focus on more specific terms and reduce redundancy.
Weight Algorithm (algorithm = "weight"): The Weight algorithm assigns different weights to genes based on their specificity within the GO hierarchy. It considers the number of terms a gene is annotated to and the specificity of those terms. This algorithm aims to prioritize genes annotated to more specific terms.
Weight01 Algorithm (algorithm = "weight01"): This algorithm is similar to the Weight algorithm but uses a different weighting scheme. It assigns weights to genes based on the hierarchical distance from the root of the GO hierarchy. It aims to provide a balance between specificity and coverage.
LEA Algorithm (algorithm = "lea"): LEA stands for "Ontology-based Local Enrichment Analysis." This algorithm integrates local and global GO structures to identify enriched terms. It considers both the local structure around a gene and the global structure of the GO hierarchy.
Parentchild Algorithm (algorithm = "parentchild"): The Parentchild algorithm considers all genes annotated to a given GO term, as well as genes annotated to its parent terms. It accounts for the hierarchical relationships between terms to determine enrichment. This algorithm aims to capture enrichment signals from both specific terms and their parent terms.
whichTests()
explain the output of topGO results
GO.ID Term Annotated Significant Expected elimKS
GO:0019563 glycerol catabolic process 6 5 0.55 3.3e-05
GO:0006355 regulation of transcription, DNA-templat... 101 19 9.22 0.0012
GO:0009401 phosphoenolpyruvate-dependent sugar phos... 18 6 1.64 0.0038
>>> 1281/117 = 6/0.55 (excecpted is 0.55, but we have 5 --> very significant!)
For the GO term "glycerol catabolic process" (GO:0019563):
* Annotated: This indicates that a total of 6 genes in your background set (which contains 2160 genes) are annotated to the "glycerol catabolic process".
* Significant: Among those 6 annotated genes, 5 of them are identified as significant (up-regulated or enriched) in your analysis.
* Expected: The expected number of genes annotated to this GO term by chance is 0.55, based on statistical calculations considering the size of the background set and the total number of genes annotated to all GO terms. 10.909090909090908;10.954446854663773;10.975609756097562 --> that means, the background/genes annotated with GO terms equals 10.9!
* elimKS: The adjusted p-value associated with the enrichment of this GO term is 3.3e-05, indicating a significant enrichment after correcting for multiple testing.
GOdata
------------------------- topGOdata object -------------------------
Description:
-
Ontology:
- BP
2161 available genes (all genes from the array):
- symbol: V0R30_00055 V0R30_00065 V0R30_00110 V0R30_00120 V0R30_00135 ...
- 244 significant genes.
1281 feasible genes (genes that can be used in the analysis):
- symbol: V0R30_00265 V0R30_00275 V0R30_00290 gehD V0R30_00740 ...
- 117 significant genes.
>>> 1281/117.0=10.948717948717949
GO graph (nodes with at least 1 genes):
- a graph with directed edges
- number of nodes = 1785
- number of edges = 3689
------------------------- topGOdata object -------------------------
Merge all down_BP[MF|CC].txt', 'up_BP[MF|CC].txt to merged_GO_enrichments_BP[MF|CC]_p0.01.txt
#merge_files.py
import pandas as pd
import glob
import re # Regular expressions library
# Lists to hold file data and metadata
all_data = []
# Patterns to match the files
patterns = ['*down_CC.txt', '*up_CC.txt']
# Iterate through each pattern and each file that matches the pattern
for pattern in patterns:
for filename in glob.glob(pattern):
# Use regular expressions to extract the 'Comparison' from the filename
# This assumes the filename format is "topGO_results_[comparison]-down_MF.txt" or "topGO_results_[comparison]-up_MF.txt"
match = re.search(r'topGO_results_(.*?)-(down|up)_CC\.txt', filename)
if match:
# Split the comparison string on the first underscore and then join with " vs ", replace remaining underscores with spaces
parts = match.group(1).split('_', 1) # Split only on the first underscore
comparison = ' vs '.join(parts)
comparison = comparison.replace('_', ' ') # Replace remaining underscores with spaces
# Determine the 'Regulation' from the filename
regulation = 'down' if 'down' in filename else 'up'
# Read the current file
current_data = pd.read_csv(filename, sep='\t', header=0) # Adjust delimiter if necessary
# Add the 'Comparison' and 'Regulation' columns
current_data['Comparison'] = comparison
current_data['Regulation'] = regulation
# Append to the list
all_data.append(current_data)
# Concatenate all dataframes into one
merged_data = pd.concat(all_data, ignore_index=True)
# Save to a new file
merged_data.to_csv('merged_GO_enrichments_CC_p0.01.txt', sep='\t', index=False)
Draw bubbleplot by reading merged_GO_enrichments_BP[MF|CC]_p0.01.txt
#Bubble plot mit excel Tabelle als input
library(ggplot2)
library(dplyr)
library(magrittr)
library(tidyr)
library(forcats)
#In the merged_GO_enrichments_CC_p0.01.txt file, change
# Significant to Count (Optional)
# elimKS --> P-value
# .4h --> +4h
# vs vs --> vs
mydat <- read.csv("merged_GO_enrichments_CC_p0.01.txt", sep="\t", header=TRUE)
#mydat$GeneRatio <- sapply(mydat$GeneRatio_frac, function(x) eval(parse(text=x)))
#mydat$FoldEnrichment <- as.numeric(mydat$FoldEnrichment)
mydat$Regulation <- factor(mydat$Regulation, levels=c("up","down"))
mydat$Comparison <- factor(mydat$Comparison, levels=c("15m vs IOM OD0.5", "1h vs IOM OD0.5", "2h vs IOM OD0.5", "4h vs IOM OD0.5", "6h vs IOM OD0.5", "4d vs IOM OD0.5", "15m vs IOM OD0.5+4h", "1h vs IOM OD0.5+4h", "2h vs IOM OD0.5+4h", "4h vs IOM OD0.5+4h", "6h vs IOM OD0.5+4h", "4d vs IOM OD0.5+4h"))
#mydat$Term <- factor(mydat$Term, levels=rev(c("Cell division", "Cell cycle", "Negative regulation of cell proliferation", "Mitotic cell cycle", "Mitotic sister chromatid segregation", "Mitotic spindle organization", "Chromosome segregation", "DNA replication", "DNA repair", "Cellular response to DNA damage stimulus", "Regulation of transcription, DNA-templated", "Positive regulation of transcription, DNA-templated", "Regulation of transcription from RNA polymerase II promoter", "Negative regulation of transcription from RNA polymerase II promoter", "rRNA processing", "Protein folding", "Immune response", "Inflammatory response", "Positive regulation of I-kappaB kinase/NF-kappaB signaling", "Cellular response to tumor necrosis factor", "Chemotaxis", "Neutrophil chemotaxis", "Innate immune response", "Response to virus", "Defense response to virus", "Cellular response to lipopolysaccharide", "Signal transduction", "Response to drug", "Apoptotic process", "Cell adhesion", "Collagen fibril organization", "Nervous system development", "Axon guidance", "Extracellular matrix organization", "Angiogenesis")))
#tiff("Bubble.tiff", units = "in", width = 35, height = 50, res=500)
#png("bubble.png", width=1400, height=600)
# -- Highlight some lines with bold --
#xl <- factor(rev(c("Cell division", "Cell cycle", "Negative regulation of cell proliferation", "Mitotic cell cycle", "Mitotic sister chromatid segregation", "Mitotic spindle organization", "Chromosome segregation", "DNA replication", "DNA repair", "Cellular response to DNA damage stimulus", "Regulation of transcription, DNA-templated", "Positive regulation of transcription, DNA-templated", "Regulation of transcription from RNA polymerase II promoter", "Negative regulation of transcription from RNA polymerase II promoter", "rRNA processing", "Protein folding", "Immune response", "Inflammatory response", "Positive regulation of I-kappaB kinase/NF-kappaB signaling", "Cellular response to tumor necrosis factor", "Chemotaxis", "Neutrophil chemotaxis", "Innate immune response", "Response to virus", "Defense response to virus", "Cellular response to lipopolysaccharide", "Signal transduction", "Response to drug", "Apoptotic process", "Cell adhesion", "Collagen fibril organization", "Nervous system development", "Axon guidance", "Extracellular matrix organization", "Angiogenesis")))
#bold.terms <- c("Innate immune response", "Response to virus", "Defense response to virus")
#bold.labels <- ifelse((xl) %in% bold.terms, yes = "bold", no = "plain")
# #-log10(FDR) can be renamed as 'Significance'
#ggplot(mydat, aes(y = Term, x = Comparison)) + geom_point(aes(color = Regulation, size = Count, alpha = abs(log10(FDR)))) + scale_color_manual(values = c("up" = "red", "down" = "blue")) + scale_size_continuous(range = c(1, 34)) + labs(x = "", y = "", color="Regulation", size="Count", alpha="-log10(FDR)") + theme(axis.text.y = element_text(face = bold.labels))+ theme(axis.text.x = element_text(angle = 30, vjust = 0.5)) + theme(axis.text = element_text(size = 40)) + theme(legend.text = element_text(size = 40)) + theme(legend.title = element_text(size = 40)) + guides(color = guide_legend(override.aes = list(size = 20)), alpha = guide_legend(override.aes = list(size = 20)))
#bold.labels = "plain"
#png("bubble_plot.png", 3000, 2000)
#ggplot(mydat, aes(y = Term, x = Comparison)) +
#geom_point(aes(color = Regulation, size = Count, alpha = abs(log10(P.value)))) +
#scale_color_manual(values = c("up" = "red", "down" = "blue")) +
#scale_size_continuous(range = c(1, 34)) +
#labs(x = "", y = "", color="Regulation", size="Count", alpha="-log10(P.value)") +
#theme(axis.text.y = element_text(face = bold.labels)) +
#theme(axis.text.x = element_text(angle = 30, vjust = 0.5)) +
#theme(axis.text = element_text(size = 40)) +
#theme(legend.text = element_text(size = 40)) +
#theme(legend.title = element_text(size = 40)) +
#guides(color = guide_legend(override.aes = list(size = 20)),
# alpha = guide_legend(override.aes = list(size = 20)))
#dev.off()
bold.labels = "plain"
png("bubble_plot.png", 2000, 700)
ggplot(mydat, aes(y = Term, x = Comparison)) +
geom_point(aes(color = Regulation, size = -log10(P.value))) +
scale_color_manual(values = c("up" = "red", "down" = "blue")) +
scale_size_continuous(range = c(4, 20)) +
labs(x = "", y = "", color="Regulation") +
theme(axis.text.y = element_text(face = bold.labels)) +
theme(axis.text.x = element_text(angle = 30, vjust = 0.5)) +
theme(axis.text = element_text(size = 40)) +
theme(legend.text = element_text(size = 40)) +
theme(legend.title = element_text(size = 40)) +
guides(color = guide_legend(override.aes = list(size = 20)),
alpha = guide_legend(override.aes = list(size = 20)))
dev.off()
#mv bubble_plot.png bubble_plot_CC_p0.01.png
Draw bubbleplot alternative version (https://gist.github.com/slavailn/ef5a1b804ba73300a2cf99b211364976 : enrichment_bubble.R)
library(ggplot2)
library(stringr)
theme_set(
theme_bw() +
theme(legend.position = "right")
)
ggplot(all_go, aes(x = sample_id, y = reorder(GO.label, Enrichment))) +
geom_point(aes(size = Enrichment, fill = P.value), alpha = 0.75, shape = 21) +
facet_wrap(~ regulation) +
theme(axis.text.y = element_text(size=5)) +
theme(axis.text.x = element_text(face="bold")) +
theme(axis.title = element_blank()) +
theme(legend.position = "right") +
scale_fill_gradient(low="bisque", high = "darkred") +
scale_size(range = c(1, 10))
点赞本文的读者
还没有人对此文章表态
没有评论
男性性别决定了实验性中风后IL-17抗体治疗的长期神经保护效果
Standard Bioinformatics Service and Custom Bioinformatics Solutions
© 2023 XGenes.com Impressum