gene_x 0 like s 478 view s
Tags: processing, pipeline
generate_promter_sequences
#!/usr/bin/env python3
#./1_generate_promoter_sequences.py gencode.v43.annotation.gtf.db
import gffutils
from pyfaidx import Fasta
import argparse
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
def extract_promoter_sequences(gencode_db, genome_records, upstream_length=3000, downstream_length=3000):
db = gffutils.FeatureDB(gencode_db, keep_order=True)
promoters = []
for gene in db.features_of_type('gene'): #CDS
if gene.strand == '+':
promoter_start = max(gene.start - upstream_length, 1)
promoter_end = gene.start + downstream_length - 1
else: # gene.strand == '-'
promoter_start = gene.end - downstream_length + 1
promoter_end = min(gene.end + upstream_length, len(genome_records[gene.chrom]))
promoter_seq = str(genome_records[gene.chrom][promoter_start-1:promoter_end])
# commenting the following two lines don't effect the output size, both are 62703 records
#if not set(promoter_seq.upper()) <= {"A", "C", "G", "T"}:
# print(f"Invalid sequence for id {gene.id}: {promoter_seq}")
if gene.strand == '-':
promoter_seq = str(Seq(promoter_seq).reverse_complement())
record = SeqRecord(Seq(promoter_seq.upper()), id=gene.id, description="")
promoters.append(record)
return promoters
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Plot peak distribution around TSS.')
parser.add_argument('gencode_db', type=str, help='Path to the gencode.v43.annotation.gtf.db file')
args = parser.parse_args()
genome_records = Fasta('/ref/Homo_sapiens/UCSC/hg38/Sequence/WholeGenomeFasta/genome.fa')
promoter_sequences = extract_promoter_sequences(args.gencode_db, genome_records)
SeqIO.write(promoter_sequences, "promoter_sequences_3000_3000.fasta", "fasta")
#62703 containing ENSG00000198804.2
find motifs in promoters
#!/usr/bin/env python3
#./find_motifs_in_promoters.py
import re
from Bio import SeqIO
def find_motif_frequency_and_distribution(fasta_file, output_file, motif_1="GRGGC", motif_2="GCCYR"):
motif_1 = motif_1.replace("R", "[AG]").replace("Y", "[CT]")
motif_2 = motif_2.replace("R", "[AG]").replace("Y", "[CT]")
promoter_sequences = SeqIO.parse(fasta_file, "fasta")
#with open(output_file, 'w') as f:
# f.write("PromoterID\tMotif1_Count\tMotif2_Count\tMotif1_Positions\tMotif2_Positions\n")
# for promoter in promoter_sequences:
# motif1_positions = [m.start() for m in re.finditer(motif_1, str(promoter.seq))]
# motif2_positions = [m.start() for m in re.finditer(motif_2, str(promoter.seq))]
# line = f"{promoter.id}\t{len(motif1_positions)}\t{len(motif2_positions)}\t{motif1_positions}\t{motif2_positions}\n"
# f.write(line)
with open(output_file, 'w') as f:
f.write("PromoterID\tMotif1_Count\tMotif2_Count\tMotif1_Center_Positions\tMotif2_Center_Positions\n")
for promoter in promoter_sequences:
motif1_positions = [(m.start() + len(motif_1) // 2 - 3000) for m in re.finditer(motif_1, str(promoter.seq))]
motif2_positions = [(m.start() + len(motif_2) // 2 - 3000) for m in re.finditer(motif_2, str(promoter.seq))]
line = f"{promoter.id}\t{len(motif1_positions)}\t{len(motif2_positions)}\t{motif1_positions}\t{motif2_positions}\n"
f.write(line)
if __name__ == "__main__":
find_motif_frequency_and_distribution("promoter_sequences_3000_3000.fasta", "motif_counts_positions.txt")
R codes
#--3.1. draw plots for all motifs
#DELETE in Kate '[' and ']' in motif_counts_positions.txt
data <- read.table("motif_counts_positions.txt", sep="\t", header=TRUE, stringsAsFactors=FALSE)
data <- rename(data, Motif_GRGGC_Count=Motif1_Count, Motif_GCCYR_Count=Motif2_Count, Motif_GRGGC_Center_Positions = Motif1_Center_Positions, Motif_GCCYR_Center_Positions = Motif2_Center_Positions)
data$Motif_GRGGC_Count <- as.numeric(data$Motif_GRGGC_Count)
data$Motif_GCCYR_Count <- as.numeric(data$Motif_GCCYR_Count)
# Histogram of Motif1 counts
png("Histogram_of_Motif_GRGGC_counts.png", width=800, height=800)
hist(data$Motif_GRGGC_Count, main="Motif GRGGC Counts", xlab="Counts")
dev.off()
# Histogram of Motif2 counts
png("Histogram_of_Motif_GCCYR_counts.png", width=800, height=800)
hist(data$Motif_GCCYR_Count, main="Motif GCCYR Counts", xlab="Counts")
dev.off()
# # Convert list-like string to actual list of numbers
# data$Motif_GRGGC_Center_Positions <- sapply(strsplit(gsub("[\\[\\]\\s+]", "", data$Motif_GRGGC_Center_Positions), ","), function(x) as.numeric(x[nzchar(x)]))
# data$Motif_GCCYR_Center_Positions <- sapply(strsplit(gsub("[\\[\\]\\s+]", "", data$Motif_GCCYR_Center_Positions), ","), function(x) as.numeric(x[nzchar(x)]))
data$Motif_GRGGC_Center_Positions <- strsplit(as.character(data$Motif_GRGGC_Center_Positions), ",")
data$Motif_GCCYR_Center_Positions <- strsplit(as.character(data$Motif_GCCYR_Center_Positions), ",")
# Create data frames for each motif
data_motif_GRGGC <- data.frame(PromoterID = rep(data$PromoterID, data$Motif_GRGGC_Count),
Motif = "Motif GRGGC",
Position = unlist(data$Motif_GRGGC_Center_Positions))
data_motif_GCCYR <- data.frame(PromoterID = rep(data$PromoterID, data$Motif_GCCYR_Count),
Motif = "Motif GCCYR",
Position = unlist(data$Motif_GCCYR_Center_Positions))
# Combine motif data frames
data_positions <- rbind(data_motif_GRGGC, data_motif_GCCYR)
# Get positions as numeric values, excluding NAs
motif1_positions <- as.numeric(data_positions$Position[data_positions$Motif == "Motif GRGGC"])
# Plot density of Motif1 positions
png("Density_of_Motif_GRGGC_positions.png", width=800, height=800)
plot(density(motif1_positions, na.rm = TRUE), main="Density Plot of Motif GRGGC Positions", xlab="Position")
dev.off()
# Get positions as numeric values, excluding NAs
motif2_positions <- as.numeric(data_positions$Position[data_positions$Motif == "Motif GCCYR"])
# Plot density of Motif2 positions
png("Density_of_Motif_GCCYR_positions.png", width=800, height=800)
plot(density(motif2_positions, na.rm = TRUE), main="Density Plot of Motif GCCYR Positions", xlab="Position")
dev.off()
#--3.2. split data into two dataframe, one is in which PromoterID occurres in filtered_data$Closest.Gene..Ensemble.ID., another is the remaining
# Create a new data frame for rows in 'data' where the PromoterID is in filtered_data's Closest Gene (Ensemble ID)
data_in_filtered <- data[data$PromoterID %in% filtered_data$Closest.Gene..Ensemble.ID., ]
data_in_filtered$Motif_GRGGC_Center_Positions <- sapply(data_in_filtered$Motif_GRGGC_Center_Positions, toString)
data_in_filtered$Motif_GCCYR_Center_Positions <- sapply(data_in_filtered$Motif_GCCYR_Center_Positions, toString)
write.table(data_in_filtered, file="motifs_in_promoters_with_LT_peak.txt", sep="\t", row.names = FALSE)
# Create another data frame for the remaining rows
data_not_in_filtered <- data[!data$PromoterID %in% filtered_data$Closest.Gene..Ensemble.ID., ]
data_not_in_filtered$Motif_GRGGC_Center_Positions <- sapply(data_not_in_filtered$Motif_GRGGC_Center_Positions, toString)
data_not_in_filtered$Motif_GCCYR_Center_Positions <- sapply(data_not_in_filtered$Motif_GCCYR_Center_Positions, toString)
write.table(data_not_in_filtered, file="motifs_in_promoters_without_LT_peak.txt", sep="\t", row.names = FALSE)
#--3.3. draw plots for motifs_in_promoters_with_LT_peak.txt
#DELETE with Kate '"' in motifs_in_promoters_with_LT_peak.txt
data <- read.table("motifs_in_promoters_with_LT_peak.txt", sep="\t", header=TRUE, stringsAsFactors=FALSE)
data$Motif_GRGGC_Count <- as.numeric(data$Motif_GRGGC_Count)
data$Motif_GCCYR_Count <- as.numeric(data$Motif_GCCYR_Count)
# Histogram of Motif_GRGGC counts
png("Histogram_of_Motif_GRGGC_in_promoters_with_LT_peak.png", width=800, height=800)
hist(data$Motif_GRGGC_Count, main="Motif GRGGC Counts", xlab="Counts")
dev.off()
# Histogram of Motif_GCCYR counts
png("Histogram_of_Motif_GCCYR_in_promoters_with_LT_peak.png", width=800, height=800)
hist(data$Motif_GCCYR_Count, main="Motif GCCYR Counts", xlab="Counts")
dev.off()
data$Motif_GRGGC_Center_Positions <- strsplit(as.character(data$Motif_GRGGC_Center_Positions), ",")
data$Motif_GCCYR_Center_Positions <- strsplit(as.character(data$Motif_GCCYR_Center_Positions), ",")
# Create data frames for each motif
data_motif1 <- data.frame(PromoterID = rep(data$PromoterID, data$Motif_GRGGC_Count),
Motif = "Motif GRGGC",
Position = unlist(data$Motif_GRGGC_Center_Positions))
data_motif2 <- data.frame(PromoterID = rep(data$PromoterID, data$Motif_GCCYR_Count),
Motif = "Motif GCCYR",
Position = unlist(data$Motif_GCCYR_Center_Positions))
# Combine motif data frames
data_positions <- rbind(data_motif1, data_motif2)
# Get positions as numeric values, excluding NAs
motif1_positions <- as.numeric(data_positions$Position[data_positions$Motif == "Motif GRGGC"])
# Plot density of Motif_GRGGC positions
png("Density_of_Motif_GRGGC_in_promoters_with_LT_peak.png", width=800, height=800)
plot(density(motif1_positions, na.rm = TRUE), main="Density Plot of Motif GRGGC Positions", xlab="Position")
dev.off()
# Get positions as numeric values, excluding NAs
motif2_positions <- as.numeric(data_positions$Position[data_positions$Motif == "Motif GCCYR"])
# Plot density of Motif_GCCYR positions
png("Density_of_Motif_GCCYR_in_promoters_with_LT_peak.png", width=800, height=800)
plot(density(motif2_positions, na.rm = TRUE), main="Density Plot of Motif GCCYR Positions", xlab="Position")
dev.off()
#--3.4. draw plots for motifs_in_promoters_without_LT_peak.txt
#DELETE '"' with Kate in motifs_in_promoters_without_LT_peak.txt
data <- read.table("motifs_in_promoters_without_LT_peak.txt", sep="\t", header=TRUE, stringsAsFactors=FALSE)
data$Motif_GRGGC_Count <- as.numeric(data$Motif_GRGGC_Count)
data$Motif_GCCYR_Count <- as.numeric(data$Motif_GCCYR_Count)
# Histogram of Motif_GRGGC counts
png("Histogram_of_Motif_GRGGC_in_promoters_without_LT_peak.png", width=800, height=800)
hist(data$Motif_GRGGC_Count, main="Motif GRGGC Counts", xlab="Counts")
dev.off()
# Histogram of Motif_GCCYR counts
png("Histogram_of_Motif_GCCYR_in_promoters_without_LT_peak.png", width=800, height=800)
hist(data$Motif_GCCYR_Count, main="Motif GCCYR Counts", xlab="Counts")
dev.off()
data$Motif_GRGGC_Center_Positions <- strsplit(as.character(data$Motif_GRGGC_Center_Positions), ",")
data$Motif_GCCYR_Center_Positions <- strsplit(as.character(data$Motif_GCCYR_Center_Positions), ",")
# Create data frames for each motif
data_motif1 <- data.frame(PromoterID = rep(data$PromoterID, data$Motif_GRGGC_Count),
Motif = "Motif GRGGC",
Position = unlist(data$Motif_GRGGC_Center_Positions))
data_motif2 <- data.frame(PromoterID = rep(data$PromoterID, data$Motif_GCCYR_Count),
Motif = "Motif GCCYR",
Position = unlist(data$Motif_GCCYR_Center_Positions))
# Combine motif data frames
data_positions <- rbind(data_motif1, data_motif2)
# Get positions as numeric values, excluding NAs
motif1_positions <- as.numeric(data_positions$Position[data_positions$Motif == "Motif GRGGC"])
# Plot density of Motif_GRGGC positions
png("Density_of_Motif_GRGGC_in_promoters_without_LT_peak.png", width=800, height=800)
plot(density(motif1_positions, na.rm = TRUE), main="Density Plot of Motif GRGGC Positions", xlab="Position")
dev.off()
# Get positions as numeric values, excluding NAs
motif2_positions <- as.numeric(data_positions$Position[data_positions$Motif == "Motif GCCYR"])
# Plot density of Motif_GCCYR positions
png("Density_of_Motif_GCCYR_in_promoters_without_LT_peak.png", width=800, height=800)
plot(density(motif2_positions, na.rm = TRUE), main="Density Plot of Motif GCCYR Positions", xlab="Position")
dev.off()
#--3.5. assign each promoter to a prefined class according to Total_Motifs and Avg_Motif_Distance_To_TSS
data <- read.table("motifs_in_promoters_with_LT_peak.txt", sep="\t", header=TRUE, stringsAsFactors=FALSE)
data$Motif_GRGGC_Count <- as.numeric(data$Motif_GRGGC_Count)
data$Motif_GCCYR_Count <- as.numeric(data$Motif_GCCYR_Count)
# Add a column for the sum of Motif_GRGGC_Count and Motif_GCCYR_Count
data <- data %>%
mutate(Total_Motifs = Motif_GRGGC_Count + Motif_GCCYR_Count)
# Calculate absolute average Motif_GRGGC distance to TSS
data$Avg_Motif_GRGGC_Distance_To_TSS <- sapply(strsplit(data$Motif_GRGGC_Center_Positions, ", "),
function(x) round(mean(as.numeric(x), na.rm = TRUE)))
# Calculate absolute average Motif_GCCYR distance to TSS
data$Avg_Motif_GCCYR_Distance_To_TSS <- sapply(strsplit(data$Motif_GCCYR_Center_Positions, ", "),
function(x) round(mean(as.numeric(x), na.rm = TRUE)))
# Revised function to calculate the mean of two strings of comma-separated numbers
mean_of_strings <- function(str1, str2) {
# Convert strings of numbers to numeric vectors
vec1 <- as.numeric(strsplit(str1, split = ",")[[1]])
vec2 <- as.numeric(strsplit(str2, split = ",")[[1]])
# Handle cases where vec1 or vec2 is NULL (i.e., when str1 or str2 is an empty string)
if (is.null(vec1)) {
vec1 <- numeric(0)
}
if (is.null(vec2)) {
vec2 <- numeric(0)
}
# Compute the mean of the two vectors
return(round(mean(c(vec1, vec2), na.rm = TRUE)))
}
# Apply the function to each row of the dataframe
data$Avg_Motif_Distance_To_TSS <- mapply(mean_of_strings, data$Motif_GRGGC_Center_Positions, data$Motif_GCCYR_Center_Positions)
# Assign classes based on the given rules
#change the class name as string "Class 1", "Class 2" ... in the following code.
data <- data %>%
mutate(
Class = case_when(
Total_Motifs < 50 & abs(Avg_Motif_Distance_To_TSS) <= 1000 ~ "Total_Motifs < 50 & abs(Avg_Motif_Distance_To_TSS) <= 1000",
Total_Motifs < 50 & abs(Avg_Motif_Distance_To_TSS) > 1000 ~ "Total_Motifs < 50 & abs(Avg_Motif_Distance_To_TSS) > 1000",
Total_Motifs >= 50 & Total_Motifs < 100 & abs(Avg_Motif_Distance_To_TSS) <= 1000 ~ "50 <= Total_Motifs < 100 & abs(Avg_Motif_Distance_To_TSS) <= 1000",
Total_Motifs >= 50 & Total_Motifs < 100 & abs(Avg_Motif_Distance_To_TSS) > 1000 ~ "50 <= Total_Motifs < 100 & abs(Avg_Motif_Distance_To_TSS) > 1000",
Total_Motifs >= 100 & Total_Motifs < 150 & abs(Avg_Motif_Distance_To_TSS) <= 1000 ~ "100 <= Total_Motifs < 150 & abs(Avg_Motif_Distance_To_TSS) <= 1000",
Total_Motifs >= 100 & Total_Motifs < 150 & abs(Avg_Motif_Distance_To_TSS) > 1000 ~ "100 <= Total_Motifs < 150 & abs(Avg_Motif_Distance_To_TSS) > 1000"
)
)
write.table(data, file="motifs_in_promoters_with_LT_peak_Category.txt", sep="\t", row.names = FALSE)
#~/Tools/csv2xls-0.4/csv_to_xls.py motifs_in_promoters_with_LT_peak_Category.txt -d$'\t' -o motifs_in_promoters_with_LT_peak_Category.xls;
点赞本文的读者
还没有人对此文章表态
没有评论
Variant Calling for Herpes Simplex Virus 1 from Patient Sample Using Capture Probe Sequencing
Typing of 81 S. epidermidis samples (Luise)
Co-Authorship Network Generator using scraped data from Google Scholar via SerpAPI
© 2023 XGenes.com Impressum