Defining and Categorizing Promoter Types Based on the 'GRGGC' Motif Frequency and Distance to TSS

gene_x 0 like s 794 view s

Tags: processing, pipeline

generate_promter_sequences

#!/usr/bin/env python3
#./1_generate_promoter_sequences.py gencode.v43.annotation.gtf.db
import gffutils
from pyfaidx import Fasta
import argparse
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq

def extract_promoter_sequences(gencode_db, genome_records, upstream_length=3000, downstream_length=3000):
    db = gffutils.FeatureDB(gencode_db, keep_order=True)

    promoters = []
    for gene in db.features_of_type('gene'): #CDS
        if gene.strand == '+':
            promoter_start = max(gene.start - upstream_length, 1)
            promoter_end = gene.start + downstream_length - 1
        else: # gene.strand == '-'
            promoter_start = gene.end - downstream_length + 1
            promoter_end = min(gene.end + upstream_length, len(genome_records[gene.chrom]))

        promoter_seq = str(genome_records[gene.chrom][promoter_start-1:promoter_end])

        # commenting the following two lines don't effect the output size, both are 62703 records 
        #if not set(promoter_seq.upper()) <= {"A", "C", "G", "T"}:
        #    print(f"Invalid sequence for id {gene.id}: {promoter_seq}")

        if gene.strand == '-':
            promoter_seq = str(Seq(promoter_seq).reverse_complement())

        record = SeqRecord(Seq(promoter_seq.upper()), id=gene.id, description="")
        promoters.append(record)
    return promoters

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Plot peak distribution around TSS.')
    parser.add_argument('gencode_db', type=str, help='Path to the gencode.v43.annotation.gtf.db file')
    args = parser.parse_args()

    genome_records = Fasta('/ref/Homo_sapiens/UCSC/hg38/Sequence/WholeGenomeFasta/genome.fa')

    promoter_sequences = extract_promoter_sequences(args.gencode_db, genome_records)
    SeqIO.write(promoter_sequences, "promoter_sequences_3000_3000.fasta", "fasta")
    #62703 containing ENSG00000198804.2

find motifs in promoters

#!/usr/bin/env python3
#./find_motifs_in_promoters.py
import re
from Bio import SeqIO

def find_motif_frequency_and_distribution(fasta_file, output_file, motif_1="GRGGC", motif_2="GCCYR"):
    motif_1 = motif_1.replace("R", "[AG]").replace("Y", "[CT]")
    motif_2 = motif_2.replace("R", "[AG]").replace("Y", "[CT]")

    promoter_sequences = SeqIO.parse(fasta_file, "fasta")
    #with open(output_file, 'w') as f:
    #    f.write("PromoterID\tMotif1_Count\tMotif2_Count\tMotif1_Positions\tMotif2_Positions\n")
    #    for promoter in promoter_sequences:
    #        motif1_positions = [m.start() for m in re.finditer(motif_1, str(promoter.seq))]
    #        motif2_positions = [m.start() for m in re.finditer(motif_2, str(promoter.seq))]
    #        line = f"{promoter.id}\t{len(motif1_positions)}\t{len(motif2_positions)}\t{motif1_positions}\t{motif2_positions}\n"
    #        f.write(line)
    with open(output_file, 'w') as f:
        f.write("PromoterID\tMotif1_Count\tMotif2_Count\tMotif1_Center_Positions\tMotif2_Center_Positions\n")
        for promoter in promoter_sequences:
            motif1_positions = [(m.start() + len(motif_1) // 2 - 3000) for m in re.finditer(motif_1, str(promoter.seq))]
            motif2_positions = [(m.start() + len(motif_2) // 2 - 3000) for m in re.finditer(motif_2, str(promoter.seq))]
            line = f"{promoter.id}\t{len(motif1_positions)}\t{len(motif2_positions)}\t{motif1_positions}\t{motif2_positions}\n"
            f.write(line)

if __name__ == "__main__":
    find_motif_frequency_and_distribution("promoter_sequences_3000_3000.fasta", "motif_counts_positions.txt")

R codes

#--3.1. draw plots for all motifs
#DELETE in Kate '[' and ']' in motif_counts_positions.txt
data <- read.table("motif_counts_positions.txt", sep="\t", header=TRUE, stringsAsFactors=FALSE)

data <- rename(data, Motif_GRGGC_Count=Motif1_Count, Motif_GCCYR_Count=Motif2_Count, Motif_GRGGC_Center_Positions = Motif1_Center_Positions, Motif_GCCYR_Center_Positions = Motif2_Center_Positions)

data$Motif_GRGGC_Count <- as.numeric(data$Motif_GRGGC_Count)
data$Motif_GCCYR_Count <- as.numeric(data$Motif_GCCYR_Count)

# Histogram of Motif1 counts
png("Histogram_of_Motif_GRGGC_counts.png", width=800, height=800)
hist(data$Motif_GRGGC_Count, main="Motif GRGGC Counts", xlab="Counts")
dev.off()

# Histogram of Motif2 counts
png("Histogram_of_Motif_GCCYR_counts.png", width=800, height=800)
hist(data$Motif_GCCYR_Count, main="Motif GCCYR Counts", xlab="Counts")
dev.off()

# # Convert list-like string to actual list of numbers
# data$Motif_GRGGC_Center_Positions <- sapply(strsplit(gsub("[\\[\\]\\s+]", "", data$Motif_GRGGC_Center_Positions), ","), function(x) as.numeric(x[nzchar(x)]))
# data$Motif_GCCYR_Center_Positions <- sapply(strsplit(gsub("[\\[\\]\\s+]", "", data$Motif_GCCYR_Center_Positions), ","), function(x) as.numeric(x[nzchar(x)]))
data$Motif_GRGGC_Center_Positions <- strsplit(as.character(data$Motif_GRGGC_Center_Positions), ",")
data$Motif_GCCYR_Center_Positions <- strsplit(as.character(data$Motif_GCCYR_Center_Positions), ",")

# Create data frames for each motif
data_motif_GRGGC <- data.frame(PromoterID = rep(data$PromoterID, data$Motif_GRGGC_Count), 
                          Motif = "Motif GRGGC", 
                          Position = unlist(data$Motif_GRGGC_Center_Positions))

data_motif_GCCYR <- data.frame(PromoterID = rep(data$PromoterID, data$Motif_GCCYR_Count), 
                          Motif = "Motif GCCYR", 
                          Position = unlist(data$Motif_GCCYR_Center_Positions))

# Combine motif data frames
data_positions <- rbind(data_motif_GRGGC, data_motif_GCCYR)

# Get positions as numeric values, excluding NAs
motif1_positions <- as.numeric(data_positions$Position[data_positions$Motif == "Motif GRGGC"])
# Plot density of Motif1 positions
png("Density_of_Motif_GRGGC_positions.png", width=800, height=800)
plot(density(motif1_positions, na.rm = TRUE), main="Density Plot of Motif GRGGC Positions", xlab="Position")
dev.off()

# Get positions as numeric values, excluding NAs
motif2_positions <- as.numeric(data_positions$Position[data_positions$Motif == "Motif GCCYR"])
# Plot density of Motif2 positions
png("Density_of_Motif_GCCYR_positions.png", width=800, height=800)
plot(density(motif2_positions, na.rm = TRUE), main="Density Plot of Motif GCCYR Positions", xlab="Position")
dev.off()

#--3.2. split data into two dataframe, one is in which PromoterID occurres in filtered_data$Closest.Gene..Ensemble.ID., another is the remaining
# Create a new data frame for rows in 'data' where the PromoterID is in filtered_data's Closest Gene (Ensemble ID)
data_in_filtered <- data[data$PromoterID %in% filtered_data$Closest.Gene..Ensemble.ID., ]
data_in_filtered$Motif_GRGGC_Center_Positions <- sapply(data_in_filtered$Motif_GRGGC_Center_Positions, toString)
data_in_filtered$Motif_GCCYR_Center_Positions <- sapply(data_in_filtered$Motif_GCCYR_Center_Positions, toString)
write.table(data_in_filtered, file="motifs_in_promoters_with_LT_peak.txt", sep="\t", row.names = FALSE)

# Create another data frame for the remaining rows
data_not_in_filtered <- data[!data$PromoterID %in% filtered_data$Closest.Gene..Ensemble.ID., ]
data_not_in_filtered$Motif_GRGGC_Center_Positions <- sapply(data_not_in_filtered$Motif_GRGGC_Center_Positions, toString)
data_not_in_filtered$Motif_GCCYR_Center_Positions <- sapply(data_not_in_filtered$Motif_GCCYR_Center_Positions, toString)
write.table(data_not_in_filtered, file="motifs_in_promoters_without_LT_peak.txt", sep="\t", row.names = FALSE)

#--3.3. draw plots for motifs_in_promoters_with_LT_peak.txt
#DELETE with Kate '"' in motifs_in_promoters_with_LT_peak.txt
data <- read.table("motifs_in_promoters_with_LT_peak.txt", sep="\t", header=TRUE, stringsAsFactors=FALSE)

data$Motif_GRGGC_Count <- as.numeric(data$Motif_GRGGC_Count)
data$Motif_GCCYR_Count <- as.numeric(data$Motif_GCCYR_Count)

# Histogram of Motif_GRGGC counts
png("Histogram_of_Motif_GRGGC_in_promoters_with_LT_peak.png", width=800, height=800)
hist(data$Motif_GRGGC_Count, main="Motif GRGGC Counts", xlab="Counts")
dev.off()

# Histogram of Motif_GCCYR counts
png("Histogram_of_Motif_GCCYR_in_promoters_with_LT_peak.png", width=800, height=800)
hist(data$Motif_GCCYR_Count, main="Motif GCCYR Counts", xlab="Counts")
dev.off()

data$Motif_GRGGC_Center_Positions <- strsplit(as.character(data$Motif_GRGGC_Center_Positions), ",")
data$Motif_GCCYR_Center_Positions <- strsplit(as.character(data$Motif_GCCYR_Center_Positions), ",")

# Create data frames for each motif
data_motif1 <- data.frame(PromoterID = rep(data$PromoterID, data$Motif_GRGGC_Count), 
                          Motif = "Motif GRGGC", 
                          Position = unlist(data$Motif_GRGGC_Center_Positions))

data_motif2 <- data.frame(PromoterID = rep(data$PromoterID, data$Motif_GCCYR_Count), 
                          Motif = "Motif GCCYR", 
                          Position = unlist(data$Motif_GCCYR_Center_Positions))

# Combine motif data frames
data_positions <- rbind(data_motif1, data_motif2)

# Get positions as numeric values, excluding NAs
motif1_positions <- as.numeric(data_positions$Position[data_positions$Motif == "Motif GRGGC"])
# Plot density of Motif_GRGGC positions
png("Density_of_Motif_GRGGC_in_promoters_with_LT_peak.png", width=800, height=800)
plot(density(motif1_positions, na.rm = TRUE), main="Density Plot of Motif GRGGC Positions", xlab="Position")
dev.off()

# Get positions as numeric values, excluding NAs
motif2_positions <- as.numeric(data_positions$Position[data_positions$Motif == "Motif GCCYR"])
# Plot density of Motif_GCCYR positions
png("Density_of_Motif_GCCYR_in_promoters_with_LT_peak.png", width=800, height=800)
plot(density(motif2_positions, na.rm = TRUE), main="Density Plot of Motif GCCYR Positions", xlab="Position")
dev.off()

#--3.4. draw plots for motifs_in_promoters_without_LT_peak.txt
#DELETE '"' with Kate in motifs_in_promoters_without_LT_peak.txt
data <- read.table("motifs_in_promoters_without_LT_peak.txt", sep="\t", header=TRUE, stringsAsFactors=FALSE)

data$Motif_GRGGC_Count <- as.numeric(data$Motif_GRGGC_Count)
data$Motif_GCCYR_Count <- as.numeric(data$Motif_GCCYR_Count)

# Histogram of Motif_GRGGC counts
png("Histogram_of_Motif_GRGGC_in_promoters_without_LT_peak.png", width=800, height=800)
hist(data$Motif_GRGGC_Count, main="Motif GRGGC Counts", xlab="Counts")
dev.off()

# Histogram of Motif_GCCYR counts
png("Histogram_of_Motif_GCCYR_in_promoters_without_LT_peak.png", width=800, height=800)
hist(data$Motif_GCCYR_Count, main="Motif GCCYR Counts", xlab="Counts")
dev.off()

data$Motif_GRGGC_Center_Positions <- strsplit(as.character(data$Motif_GRGGC_Center_Positions), ",")
data$Motif_GCCYR_Center_Positions <- strsplit(as.character(data$Motif_GCCYR_Center_Positions), ",")

# Create data frames for each motif
data_motif1 <- data.frame(PromoterID = rep(data$PromoterID, data$Motif_GRGGC_Count), 
                          Motif = "Motif GRGGC", 
                          Position = unlist(data$Motif_GRGGC_Center_Positions))

data_motif2 <- data.frame(PromoterID = rep(data$PromoterID, data$Motif_GCCYR_Count), 
                          Motif = "Motif GCCYR", 
                          Position = unlist(data$Motif_GCCYR_Center_Positions))

# Combine motif data frames
data_positions <- rbind(data_motif1, data_motif2)

# Get positions as numeric values, excluding NAs
motif1_positions <- as.numeric(data_positions$Position[data_positions$Motif == "Motif GRGGC"])
# Plot density of Motif_GRGGC positions
png("Density_of_Motif_GRGGC_in_promoters_without_LT_peak.png", width=800, height=800)
plot(density(motif1_positions, na.rm = TRUE), main="Density Plot of Motif GRGGC Positions", xlab="Position")
dev.off()

# Get positions as numeric values, excluding NAs
motif2_positions <- as.numeric(data_positions$Position[data_positions$Motif == "Motif GCCYR"])
# Plot density of Motif_GCCYR positions
png("Density_of_Motif_GCCYR_in_promoters_without_LT_peak.png", width=800, height=800)
plot(density(motif2_positions, na.rm = TRUE), main="Density Plot of Motif GCCYR Positions", xlab="Position")
dev.off()

#--3.5. assign each promoter to a prefined class according to Total_Motifs and Avg_Motif_Distance_To_TSS
data <- read.table("motifs_in_promoters_with_LT_peak.txt", sep="\t", header=TRUE, stringsAsFactors=FALSE)
data$Motif_GRGGC_Count <- as.numeric(data$Motif_GRGGC_Count)
data$Motif_GCCYR_Count <- as.numeric(data$Motif_GCCYR_Count)

# Add a column for the sum of Motif_GRGGC_Count and Motif_GCCYR_Count
data <- data %>%
  mutate(Total_Motifs = Motif_GRGGC_Count + Motif_GCCYR_Count)

# Calculate absolute average Motif_GRGGC distance to TSS
data$Avg_Motif_GRGGC_Distance_To_TSS <- sapply(strsplit(data$Motif_GRGGC_Center_Positions, ", "), 
                                         function(x) round(mean(as.numeric(x), na.rm = TRUE)))
# Calculate absolute average Motif_GCCYR distance to TSS
data$Avg_Motif_GCCYR_Distance_To_TSS <- sapply(strsplit(data$Motif_GCCYR_Center_Positions, ", "), 
                                         function(x) round(mean(as.numeric(x), na.rm = TRUE)))

# Revised function to calculate the mean of two strings of comma-separated numbers
mean_of_strings <- function(str1, str2) {
  # Convert strings of numbers to numeric vectors
  vec1 <- as.numeric(strsplit(str1, split = ",")[[1]])
  vec2 <- as.numeric(strsplit(str2, split = ",")[[1]])
  # Handle cases where vec1 or vec2 is NULL (i.e., when str1 or str2 is an empty string)
  if (is.null(vec1)) {
    vec1 <- numeric(0)
  }
  if (is.null(vec2)) {
    vec2 <- numeric(0)
  }
  # Compute the mean of the two vectors
  return(round(mean(c(vec1, vec2), na.rm = TRUE)))
}
# Apply the function to each row of the dataframe
data$Avg_Motif_Distance_To_TSS <- mapply(mean_of_strings, data$Motif_GRGGC_Center_Positions, data$Motif_GCCYR_Center_Positions)

# Assign classes based on the given rules
#change the class name as string "Class 1", "Class 2" ... in the following code.
data <- data %>%
  mutate(
    Class = case_when(
      Total_Motifs < 50 & abs(Avg_Motif_Distance_To_TSS) <= 1000 ~ "Total_Motifs < 50 & abs(Avg_Motif_Distance_To_TSS) <= 1000",
      Total_Motifs < 50 & abs(Avg_Motif_Distance_To_TSS) > 1000 ~ "Total_Motifs < 50 & abs(Avg_Motif_Distance_To_TSS) > 1000",
      Total_Motifs >= 50 & Total_Motifs < 100 & abs(Avg_Motif_Distance_To_TSS) <= 1000 ~ "50 <= Total_Motifs < 100 & abs(Avg_Motif_Distance_To_TSS) <= 1000",
      Total_Motifs >= 50 & Total_Motifs < 100 & abs(Avg_Motif_Distance_To_TSS) > 1000 ~ "50 <= Total_Motifs < 100 & abs(Avg_Motif_Distance_To_TSS) > 1000",
      Total_Motifs >= 100 & Total_Motifs < 150 & abs(Avg_Motif_Distance_To_TSS) <= 1000 ~ "100 <= Total_Motifs < 150 & abs(Avg_Motif_Distance_To_TSS) <= 1000",
      Total_Motifs >= 100 & Total_Motifs < 150 & abs(Avg_Motif_Distance_To_TSS) > 1000 ~ "100 <= Total_Motifs < 150 & abs(Avg_Motif_Distance_To_TSS) > 1000"
    )
  )

write.table(data, file="motifs_in_promoters_with_LT_peak_Category.txt", sep="\t", row.names = FALSE)  
#~/Tools/csv2xls-0.4/csv_to_xls.py motifs_in_promoters_with_LT_peak_Category.txt -d$'\t' -o  motifs_in_promoters_with_LT_peak_Category.xls;

like unlike

点赞本文的读者

还没有人对此文章表态

本文有评论

没有评论

Defining and Categorizing Promoter Types Based on the 'GRGGC' Motif Frequency and Distance to TSS

本文有评论

看文章，发评论，不要沉默

最受欢迎文章

最新文章

最多评论文章

推荐相似文章