gene_x 0 like s 68 view s
Tags: pipeline
Generate protein fasta
python3 getProteinSequences_1.py > Proximity_labelling.fasta
python3 getProteinSequences_2.py > ALFA_pulldown.fasta
python3 getProteinSequences_3.py > schnittmenge.fasta
Using eggnog_env to generate the intial annotations
mamba activate eggnog_env
emapper.py -i Proximity_labelling.fasta -o eggnog_out_Proximity_labelling --cpu 60 #--resume
emapper.py -i ALFA_pulldown.fasta -o eggnog_out_ALFA_pulldown --cpu 60
emapper.py -i schnittmenge.fasta -o eggnog_out_schnittmenge --cpu 60
Using BLAST2GO to generate GO terms and EC terms
Using R to merge the two files blast2go_annot.annot2_ and eggnog_out.emapper.annotations.txt
#mamba activate r_env
# PREPARING go_terms and ec_terms: annot_* file: cut -f1-2 -d$'\t' blast2go_annot.annot2 > blast2go_annot.annot2_
# PREPARING cp eggnog_out.emapper.annotations eggnog_out.emapper.annotations.txt. Note that the first last 3 lines starting with "#" should be removed; '#query' to query
# Load required libraries
library(openxlsx) # For Excel file handling
library(dplyr) # For data manipulation
library(clusterProfiler) # For KEGG and GO enrichment analysis
#library(org.Hs.eg.db) # Replace with appropriate organism database
#BiocManager::install("GO.db")
#BiocManager::install("AnnotationDbi")
library(GO.db)
library(AnnotationDbi)
# Step 1: Load the blast2go annotation file with a check for missing columns
annot_df <- read.table("/home/jhuang/b2gWorkspace_schnittmenge/blast2go_annot.annot2_",
header = FALSE, sep = "\t", stringsAsFactors = FALSE, fill = TRUE)
# If the structure is inconsistent, we can make sure there are exactly 3 columns:
colnames(annot_df) <- c("GeneID", "Term")
# Step 2: Filter and aggregate GO and EC terms as before
go_terms <- annot_df %>%
filter(grepl("^GO:", Term)) %>%
group_by(GeneID) %>%
summarize(GOs = paste(Term, collapse = ","), .groups = "drop")
ec_terms <- annot_df %>%
filter(grepl("^EC:", Term)) %>%
group_by(GeneID) %>%
summarize(EC = paste(Term, collapse = ","), .groups = "drop")
eggnog_data <- read.delim("~/DATA/Data_Michelle_ProteinClustering/eggnog_out_schnittmenge.emapper.annotations.txt", header = TRUE, sep = "\t")
# Perform the left joins and rename columns
eggnog_data_updated <- eggnog_data %>%
left_join(go_terms, by = "GeneID") %>%
left_join(ec_terms, by = "GeneID")
#%>% select(-EC.x, -GOs.x)
#%>% rename(EC = EC.y, GOs = GOs.y)
# Create a new workbook
wb <- createWorkbook()
# Add the complete dataset as the first sheet (with annotations)
addWorksheet(wb, "Complete_Data")
writeData(wb, "Complete_Data", eggnog_data_updated)
# Save the workbook to a file
saveWorkbook(wb, "Gene_with_Annotations_schnittmenge.xlsx", overwrite = TRUE)
#Manually DELETE GOs.x and EC.x and rename GOs.y to GOs, EC.y to EC.
Draw group bar plot for relative occurrence
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
# COG annotations for the two gene sets
proximity_labelling = [
'D', 'S', 'K', 'S', 'T', 'S', 'S', 'F', 'E', 'J', 'G', 'C', 'C', 'S', 'I', 'L', 'I', 'S', 'S', 'G', 'O', 'H', 'IQ', 'L', 'C', 'H', 'C', 'NU', 'O', 'C',
'M', 'S', 'IQ', 'G', 'S', 'M', 'M', '-', 'M', 'S', 'J', 'S', 'IQ', 'K', 'E', 'J', 'K', 'E', 'J', 'S', 'K', 'E', 'S', 'J', 'S', 'V', 'C', 'I', 'K', 'IQ',
'M', 'S', 'T', 'P', 'C', 'J', 'S', 'E', 'C', 'K', 'H', 'S', 'H', 'H'
]
alfa_pulldown = [
'K', 'S', 'P', 'K', 'C', 'K', 'I', 'O', '-', 'F', 'C', 'P', 'HQ', '-', 'L', 'L', 'S', 'G', 'IQ', 'S'
]
# Define all possible COG categories (including the combinations)
all_cog_categories_ = ['J','A', 'K', 'L', 'B', 'D', 'Y', 'V', 'T', 'M', 'N', 'Z', 'W', 'U', 'O', 'C','G', 'E', 'F', 'H', 'I', 'P', 'Q', 'R', 'S']
all_cog_categories = all_cog_categories_[::-1]
# Define functional groups for COG categories
functional_groups = {
"INFORMATION STORAGE AND PROCESSING": ['J', 'A', 'K', 'L', 'B'],
"CELLULAR PROCESSES AND SIGNALING": ['D', 'Y', 'V', 'T', 'M', 'N', 'Z', 'W', 'U', 'O'],
"METABOLISM": ['C', 'G', 'E', 'F', 'H', 'I', 'P', 'Q'],
"POORLY CHARACTERIZED": ['R', 'S']
}
# Function to count occurrences of each COG category in a gene set, handling combinations
def count_cog_categories(gene_set):
counts = {cog: gene_set.count(cog) for cog in all_cog_categories}
# Treat combinations (e.g., IQ, HQ, NU) as belonging to both respective categories
combination_categories = {
'IQ': ['I', 'Q'],
'HQ': ['H', 'Q'],
'NU': ['N', 'U']
}
for comb, categories in combination_categories.items():
comb_count = gene_set.count(comb)
for category in categories:
counts[category] += comb_count
counts[comb] = 0 # Remove the combination itself as it is distributed
return counts
# Count COG categories for each gene set
proximity_counts = count_cog_categories(proximity_labelling)
alfa_counts = count_cog_categories(alfa_pulldown)
# Calculate relative occurrence (percentage) of each COG category for both gene sets
total_proximity = len(proximity_labelling)
total_alfa = len(alfa_pulldown)
# Calculate percentages for each COG category in both gene sets
proximity_percentage = {cog: count / total_proximity * 100 for cog, count in proximity_counts.items()}
alfa_percentage = {cog: count / total_alfa * 100 for cog, count in alfa_counts.items()}
# Create a DataFrame to store the percentages for both gene sets
df_percentage = pd.DataFrame({
'Proximity Labelling (%)': [proximity_percentage.get(cog, 0) for cog in all_cog_categories],
'ALFA Pulldown (%)': [alfa_percentage.get(cog, 0) for cog in all_cog_categories]
}, index=all_cog_categories)
## Reverse the order of the COG categories
#df_percentage = df_percentage[::-1]
# Round the values to 1 decimal place
df_percentage = df_percentage.round(1)
# Save the relative occurrence data in a text file
file_path = "COG_relative_occurrence.txt"
df_percentage.to_csv(file_path, sep='\t', header=True, index=True)
# Save the relative occurrence data in an Excel file
file_path = "COG_relative_occurrence.xlsx"
df_percentage.to_excel(file_path, header=True, index=True)
# Display file path where the data was saved
print(f"Relative occurrence data has been saved to {file_path}")
# --------------------
# Horizontal Grouped Bar Plot
# --------------------
# Plot a horizontal grouped bar chart
fig, ax = plt.subplots(figsize=(12, 8))
# Create the positions for each group of bars
bar_height = 0.35
index = np.arange(len(df_percentage))
# Plot bars for both gene sets
bar1 = ax.barh(index, df_percentage['Proximity Labelling (%)'], bar_height, label='Proximity Labelling', color='skyblue')
bar2 = ax.barh(index + bar_height, df_percentage['ALFA Pulldown (%)'], bar_height, label='ALFA Pulldown', color='lightgreen')
# Customize the plot
ax.set_ylabel('COG Categories')
ax.set_xlabel('Relative Occurrence (%)')
ax.set_title('Relative COG Category Distribution')
ax.set_yticks(index + bar_height / 2)
ax.set_yticklabels(all_cog_categories)
# Annotate functional groups
#, fontweight='bold'
for group, categories in functional_groups.items():
# Find indices of the categories in this group
group_indices = [all_cog_categories.index(cog) for cog in categories]
# Add a label for the functional group
ax.annotate(group, xy=(1, group_indices[0] + 0.5), xycoords='data', fontsize=9, color='black')
# Add legend
ax.legend()
# Show the plot
plt.tight_layout()
plt.show()
点赞本文的读者
还没有人对此文章表态
没有评论
KEGG and GO annotations in non-model organisms
RNA-seq Tam on Acinetobacter baumannii strain ATCC 19606 CP059040.1 (Data_Tam_RNAseq_2024)
Enhanced Visualization of Gene Presence for the Selected Genes in Bongarts_S.epidermidis_HDRNA
© 2023 XGenes.com Impressum