Defining and Categorizing Promoter Types Based on the 'GRGGC' Motif Frequency and Distance to TSS

gene_x 0 like s 478 view s

Tags: processing, pipeline

  1. generate_promter_sequences

    #!/usr/bin/env python3
    #./1_generate_promoter_sequences.py gencode.v43.annotation.gtf.db
    import gffutils
    from pyfaidx import Fasta
    import argparse
    from Bio import SeqIO
    from Bio.SeqRecord import SeqRecord
    from Bio.Seq import Seq
    
    def extract_promoter_sequences(gencode_db, genome_records, upstream_length=3000, downstream_length=3000):
        db = gffutils.FeatureDB(gencode_db, keep_order=True)
    
        promoters = []
        for gene in db.features_of_type('gene'): #CDS
            if gene.strand == '+':
                promoter_start = max(gene.start - upstream_length, 1)
                promoter_end = gene.start + downstream_length - 1
            else: # gene.strand == '-'
                promoter_start = gene.end - downstream_length + 1
                promoter_end = min(gene.end + upstream_length, len(genome_records[gene.chrom]))
    
            promoter_seq = str(genome_records[gene.chrom][promoter_start-1:promoter_end])
    
            # commenting the following two lines don't effect the output size, both are 62703 records 
            #if not set(promoter_seq.upper()) <= {"A", "C", "G", "T"}:
            #    print(f"Invalid sequence for id {gene.id}: {promoter_seq}")
    
            if gene.strand == '-':
                promoter_seq = str(Seq(promoter_seq).reverse_complement())
    
            record = SeqRecord(Seq(promoter_seq.upper()), id=gene.id, description="")
            promoters.append(record)
        return promoters
    
    if __name__ == "__main__":
        parser = argparse.ArgumentParser(description='Plot peak distribution around TSS.')
        parser.add_argument('gencode_db', type=str, help='Path to the gencode.v43.annotation.gtf.db file')
        args = parser.parse_args()
    
        genome_records = Fasta('/ref/Homo_sapiens/UCSC/hg38/Sequence/WholeGenomeFasta/genome.fa')
    
        promoter_sequences = extract_promoter_sequences(args.gencode_db, genome_records)
        SeqIO.write(promoter_sequences, "promoter_sequences_3000_3000.fasta", "fasta")
        #62703 containing ENSG00000198804.2
    
  2. find motifs in promoters

    #!/usr/bin/env python3
    #./find_motifs_in_promoters.py
    import re
    from Bio import SeqIO
    
    def find_motif_frequency_and_distribution(fasta_file, output_file, motif_1="GRGGC", motif_2="GCCYR"):
        motif_1 = motif_1.replace("R", "[AG]").replace("Y", "[CT]")
        motif_2 = motif_2.replace("R", "[AG]").replace("Y", "[CT]")
    
        promoter_sequences = SeqIO.parse(fasta_file, "fasta")
        #with open(output_file, 'w') as f:
        #    f.write("PromoterID\tMotif1_Count\tMotif2_Count\tMotif1_Positions\tMotif2_Positions\n")
        #    for promoter in promoter_sequences:
        #        motif1_positions = [m.start() for m in re.finditer(motif_1, str(promoter.seq))]
        #        motif2_positions = [m.start() for m in re.finditer(motif_2, str(promoter.seq))]
        #        line = f"{promoter.id}\t{len(motif1_positions)}\t{len(motif2_positions)}\t{motif1_positions}\t{motif2_positions}\n"
        #        f.write(line)
        with open(output_file, 'w') as f:
            f.write("PromoterID\tMotif1_Count\tMotif2_Count\tMotif1_Center_Positions\tMotif2_Center_Positions\n")
            for promoter in promoter_sequences:
                motif1_positions = [(m.start() + len(motif_1) // 2 - 3000) for m in re.finditer(motif_1, str(promoter.seq))]
                motif2_positions = [(m.start() + len(motif_2) // 2 - 3000) for m in re.finditer(motif_2, str(promoter.seq))]
                line = f"{promoter.id}\t{len(motif1_positions)}\t{len(motif2_positions)}\t{motif1_positions}\t{motif2_positions}\n"
                f.write(line)
    
    if __name__ == "__main__":
        find_motif_frequency_and_distribution("promoter_sequences_3000_3000.fasta", "motif_counts_positions.txt")
    
  3. R codes

    #--3.1. draw plots for all motifs
    #DELETE in Kate '[' and ']' in motif_counts_positions.txt
    data <- read.table("motif_counts_positions.txt", sep="\t", header=TRUE, stringsAsFactors=FALSE)
    
    data <- rename(data, Motif_GRGGC_Count=Motif1_Count, Motif_GCCYR_Count=Motif2_Count, Motif_GRGGC_Center_Positions = Motif1_Center_Positions, Motif_GCCYR_Center_Positions = Motif2_Center_Positions)
    
    data$Motif_GRGGC_Count <- as.numeric(data$Motif_GRGGC_Count)
    data$Motif_GCCYR_Count <- as.numeric(data$Motif_GCCYR_Count)
    
    # Histogram of Motif1 counts
    png("Histogram_of_Motif_GRGGC_counts.png", width=800, height=800)
    hist(data$Motif_GRGGC_Count, main="Motif GRGGC Counts", xlab="Counts")
    dev.off()
    
    # Histogram of Motif2 counts
    png("Histogram_of_Motif_GCCYR_counts.png", width=800, height=800)
    hist(data$Motif_GCCYR_Count, main="Motif GCCYR Counts", xlab="Counts")
    dev.off()
    
    # # Convert list-like string to actual list of numbers
    # data$Motif_GRGGC_Center_Positions <- sapply(strsplit(gsub("[\\[\\]\\s+]", "", data$Motif_GRGGC_Center_Positions), ","), function(x) as.numeric(x[nzchar(x)]))
    # data$Motif_GCCYR_Center_Positions <- sapply(strsplit(gsub("[\\[\\]\\s+]", "", data$Motif_GCCYR_Center_Positions), ","), function(x) as.numeric(x[nzchar(x)]))
    data$Motif_GRGGC_Center_Positions <- strsplit(as.character(data$Motif_GRGGC_Center_Positions), ",")
    data$Motif_GCCYR_Center_Positions <- strsplit(as.character(data$Motif_GCCYR_Center_Positions), ",")
    
    # Create data frames for each motif
    data_motif_GRGGC <- data.frame(PromoterID = rep(data$PromoterID, data$Motif_GRGGC_Count), 
                              Motif = "Motif GRGGC", 
                              Position = unlist(data$Motif_GRGGC_Center_Positions))
    
    data_motif_GCCYR <- data.frame(PromoterID = rep(data$PromoterID, data$Motif_GCCYR_Count), 
                              Motif = "Motif GCCYR", 
                              Position = unlist(data$Motif_GCCYR_Center_Positions))
    
    # Combine motif data frames
    data_positions <- rbind(data_motif_GRGGC, data_motif_GCCYR)
    
    # Get positions as numeric values, excluding NAs
    motif1_positions <- as.numeric(data_positions$Position[data_positions$Motif == "Motif GRGGC"])
    # Plot density of Motif1 positions
    png("Density_of_Motif_GRGGC_positions.png", width=800, height=800)
    plot(density(motif1_positions, na.rm = TRUE), main="Density Plot of Motif GRGGC Positions", xlab="Position")
    dev.off()
    
    # Get positions as numeric values, excluding NAs
    motif2_positions <- as.numeric(data_positions$Position[data_positions$Motif == "Motif GCCYR"])
    # Plot density of Motif2 positions
    png("Density_of_Motif_GCCYR_positions.png", width=800, height=800)
    plot(density(motif2_positions, na.rm = TRUE), main="Density Plot of Motif GCCYR Positions", xlab="Position")
    dev.off()
    
    #--3.2. split data into two dataframe, one is in which PromoterID occurres in filtered_data$Closest.Gene..Ensemble.ID., another is the remaining
    # Create a new data frame for rows in 'data' where the PromoterID is in filtered_data's Closest Gene (Ensemble ID)
    data_in_filtered <- data[data$PromoterID %in% filtered_data$Closest.Gene..Ensemble.ID., ]
    data_in_filtered$Motif_GRGGC_Center_Positions <- sapply(data_in_filtered$Motif_GRGGC_Center_Positions, toString)
    data_in_filtered$Motif_GCCYR_Center_Positions <- sapply(data_in_filtered$Motif_GCCYR_Center_Positions, toString)
    write.table(data_in_filtered, file="motifs_in_promoters_with_LT_peak.txt", sep="\t", row.names = FALSE)
    
    # Create another data frame for the remaining rows
    data_not_in_filtered <- data[!data$PromoterID %in% filtered_data$Closest.Gene..Ensemble.ID., ]
    data_not_in_filtered$Motif_GRGGC_Center_Positions <- sapply(data_not_in_filtered$Motif_GRGGC_Center_Positions, toString)
    data_not_in_filtered$Motif_GCCYR_Center_Positions <- sapply(data_not_in_filtered$Motif_GCCYR_Center_Positions, toString)
    write.table(data_not_in_filtered, file="motifs_in_promoters_without_LT_peak.txt", sep="\t", row.names = FALSE)
    
    #--3.3. draw plots for motifs_in_promoters_with_LT_peak.txt
    #DELETE with Kate '"' in motifs_in_promoters_with_LT_peak.txt
    data <- read.table("motifs_in_promoters_with_LT_peak.txt", sep="\t", header=TRUE, stringsAsFactors=FALSE)
    
    data$Motif_GRGGC_Count <- as.numeric(data$Motif_GRGGC_Count)
    data$Motif_GCCYR_Count <- as.numeric(data$Motif_GCCYR_Count)
    
    # Histogram of Motif_GRGGC counts
    png("Histogram_of_Motif_GRGGC_in_promoters_with_LT_peak.png", width=800, height=800)
    hist(data$Motif_GRGGC_Count, main="Motif GRGGC Counts", xlab="Counts")
    dev.off()
    
    # Histogram of Motif_GCCYR counts
    png("Histogram_of_Motif_GCCYR_in_promoters_with_LT_peak.png", width=800, height=800)
    hist(data$Motif_GCCYR_Count, main="Motif GCCYR Counts", xlab="Counts")
    dev.off()
    
    data$Motif_GRGGC_Center_Positions <- strsplit(as.character(data$Motif_GRGGC_Center_Positions), ",")
    data$Motif_GCCYR_Center_Positions <- strsplit(as.character(data$Motif_GCCYR_Center_Positions), ",")
    
    # Create data frames for each motif
    data_motif1 <- data.frame(PromoterID = rep(data$PromoterID, data$Motif_GRGGC_Count), 
                              Motif = "Motif GRGGC", 
                              Position = unlist(data$Motif_GRGGC_Center_Positions))
    
    data_motif2 <- data.frame(PromoterID = rep(data$PromoterID, data$Motif_GCCYR_Count), 
                              Motif = "Motif GCCYR", 
                              Position = unlist(data$Motif_GCCYR_Center_Positions))
    
    # Combine motif data frames
    data_positions <- rbind(data_motif1, data_motif2)
    
    # Get positions as numeric values, excluding NAs
    motif1_positions <- as.numeric(data_positions$Position[data_positions$Motif == "Motif GRGGC"])
    # Plot density of Motif_GRGGC positions
    png("Density_of_Motif_GRGGC_in_promoters_with_LT_peak.png", width=800, height=800)
    plot(density(motif1_positions, na.rm = TRUE), main="Density Plot of Motif GRGGC Positions", xlab="Position")
    dev.off()
    
    # Get positions as numeric values, excluding NAs
    motif2_positions <- as.numeric(data_positions$Position[data_positions$Motif == "Motif GCCYR"])
    # Plot density of Motif_GCCYR positions
    png("Density_of_Motif_GCCYR_in_promoters_with_LT_peak.png", width=800, height=800)
    plot(density(motif2_positions, na.rm = TRUE), main="Density Plot of Motif GCCYR Positions", xlab="Position")
    dev.off()
    
    #--3.4. draw plots for motifs_in_promoters_without_LT_peak.txt
    #DELETE '"' with Kate in motifs_in_promoters_without_LT_peak.txt
    data <- read.table("motifs_in_promoters_without_LT_peak.txt", sep="\t", header=TRUE, stringsAsFactors=FALSE)
    
    data$Motif_GRGGC_Count <- as.numeric(data$Motif_GRGGC_Count)
    data$Motif_GCCYR_Count <- as.numeric(data$Motif_GCCYR_Count)
    
    # Histogram of Motif_GRGGC counts
    png("Histogram_of_Motif_GRGGC_in_promoters_without_LT_peak.png", width=800, height=800)
    hist(data$Motif_GRGGC_Count, main="Motif GRGGC Counts", xlab="Counts")
    dev.off()
    
    # Histogram of Motif_GCCYR counts
    png("Histogram_of_Motif_GCCYR_in_promoters_without_LT_peak.png", width=800, height=800)
    hist(data$Motif_GCCYR_Count, main="Motif GCCYR Counts", xlab="Counts")
    dev.off()
    
    data$Motif_GRGGC_Center_Positions <- strsplit(as.character(data$Motif_GRGGC_Center_Positions), ",")
    data$Motif_GCCYR_Center_Positions <- strsplit(as.character(data$Motif_GCCYR_Center_Positions), ",")
    
    # Create data frames for each motif
    data_motif1 <- data.frame(PromoterID = rep(data$PromoterID, data$Motif_GRGGC_Count), 
                              Motif = "Motif GRGGC", 
                              Position = unlist(data$Motif_GRGGC_Center_Positions))
    
    data_motif2 <- data.frame(PromoterID = rep(data$PromoterID, data$Motif_GCCYR_Count), 
                              Motif = "Motif GCCYR", 
                              Position = unlist(data$Motif_GCCYR_Center_Positions))
    
    # Combine motif data frames
    data_positions <- rbind(data_motif1, data_motif2)
    
    # Get positions as numeric values, excluding NAs
    motif1_positions <- as.numeric(data_positions$Position[data_positions$Motif == "Motif GRGGC"])
    # Plot density of Motif_GRGGC positions
    png("Density_of_Motif_GRGGC_in_promoters_without_LT_peak.png", width=800, height=800)
    plot(density(motif1_positions, na.rm = TRUE), main="Density Plot of Motif GRGGC Positions", xlab="Position")
    dev.off()
    
    # Get positions as numeric values, excluding NAs
    motif2_positions <- as.numeric(data_positions$Position[data_positions$Motif == "Motif GCCYR"])
    # Plot density of Motif_GCCYR positions
    png("Density_of_Motif_GCCYR_in_promoters_without_LT_peak.png", width=800, height=800)
    plot(density(motif2_positions, na.rm = TRUE), main="Density Plot of Motif GCCYR Positions", xlab="Position")
    dev.off()
    
    #--3.5. assign each promoter to a prefined class according to Total_Motifs and Avg_Motif_Distance_To_TSS
    data <- read.table("motifs_in_promoters_with_LT_peak.txt", sep="\t", header=TRUE, stringsAsFactors=FALSE)
    data$Motif_GRGGC_Count <- as.numeric(data$Motif_GRGGC_Count)
    data$Motif_GCCYR_Count <- as.numeric(data$Motif_GCCYR_Count)
    
    # Add a column for the sum of Motif_GRGGC_Count and Motif_GCCYR_Count
    data <- data %>%
      mutate(Total_Motifs = Motif_GRGGC_Count + Motif_GCCYR_Count)
    
    # Calculate absolute average Motif_GRGGC distance to TSS
    data$Avg_Motif_GRGGC_Distance_To_TSS <- sapply(strsplit(data$Motif_GRGGC_Center_Positions, ", "), 
                                             function(x) round(mean(as.numeric(x), na.rm = TRUE)))
    # Calculate absolute average Motif_GCCYR distance to TSS
    data$Avg_Motif_GCCYR_Distance_To_TSS <- sapply(strsplit(data$Motif_GCCYR_Center_Positions, ", "), 
                                             function(x) round(mean(as.numeric(x), na.rm = TRUE)))
    
    # Revised function to calculate the mean of two strings of comma-separated numbers
    mean_of_strings <- function(str1, str2) {
      # Convert strings of numbers to numeric vectors
      vec1 <- as.numeric(strsplit(str1, split = ",")[[1]])
      vec2 <- as.numeric(strsplit(str2, split = ",")[[1]])
      # Handle cases where vec1 or vec2 is NULL (i.e., when str1 or str2 is an empty string)
      if (is.null(vec1)) {
        vec1 <- numeric(0)
      }
      if (is.null(vec2)) {
        vec2 <- numeric(0)
      }
      # Compute the mean of the two vectors
      return(round(mean(c(vec1, vec2), na.rm = TRUE)))
    }
    # Apply the function to each row of the dataframe
    data$Avg_Motif_Distance_To_TSS <- mapply(mean_of_strings, data$Motif_GRGGC_Center_Positions, data$Motif_GCCYR_Center_Positions)
    
    # Assign classes based on the given rules
    #change the class name as string "Class 1", "Class 2" ... in the following code.
    data <- data %>%
      mutate(
        Class = case_when(
          Total_Motifs < 50 & abs(Avg_Motif_Distance_To_TSS) <= 1000 ~ "Total_Motifs < 50 & abs(Avg_Motif_Distance_To_TSS) <= 1000",
          Total_Motifs < 50 & abs(Avg_Motif_Distance_To_TSS) > 1000 ~ "Total_Motifs < 50 & abs(Avg_Motif_Distance_To_TSS) > 1000",
          Total_Motifs >= 50 & Total_Motifs < 100 & abs(Avg_Motif_Distance_To_TSS) <= 1000 ~ "50 <= Total_Motifs < 100 & abs(Avg_Motif_Distance_To_TSS) <= 1000",
          Total_Motifs >= 50 & Total_Motifs < 100 & abs(Avg_Motif_Distance_To_TSS) > 1000 ~ "50 <= Total_Motifs < 100 & abs(Avg_Motif_Distance_To_TSS) > 1000",
          Total_Motifs >= 100 & Total_Motifs < 150 & abs(Avg_Motif_Distance_To_TSS) <= 1000 ~ "100 <= Total_Motifs < 150 & abs(Avg_Motif_Distance_To_TSS) <= 1000",
          Total_Motifs >= 100 & Total_Motifs < 150 & abs(Avg_Motif_Distance_To_TSS) > 1000 ~ "100 <= Total_Motifs < 150 & abs(Avg_Motif_Distance_To_TSS) > 1000"
        )
      )
    
    write.table(data, file="motifs_in_promoters_with_LT_peak_Category.txt", sep="\t", row.names = FALSE)  
    #~/Tools/csv2xls-0.4/csv_to_xls.py motifs_in_promoters_with_LT_peak_Category.txt -d$'\t' -o  motifs_in_promoters_with_LT_peak_Category.xls;
    

like unlike

点赞本文的读者

还没有人对此文章表态


本文有评论

没有评论

看文章,发评论,不要沉默


© 2023 XGenes.com Impressum