Analysis of Peak Distribution in Promoters

gene_x 0 like s 1392 view s

Tags: python, Biopython, genomics

import pprint
import argparse
import matplotlib.pyplot as plt
import pandas as pd
import gffutils
import numpy as np

#db = gffutils.create_db('gencode.v43.annotation.gtf', dbfn='gencode.v43.annotation.db', force=True, keep_order=True, merge_strategy='merge', sort_attribute_values=True)
#python3 plot_peaks4.py peaks.bed gencode.v43.annotation.db

def plot_peak_distribution(peaks_file, gencode_db):
    db = gffutils.FeatureDB(gencode_db, keep_order=True)

    peaks = pd.read_table(peaks_file, header=None)
    peaks.columns = ['chrom', 'start', 'end', 'name', 'score']

    tss_positions = []
    tss_genes = []
    for gene in db.features_of_type('gene'):
        if gene.strand == '+':
            tss_positions.append(gene.start)
        else:
            tss_positions.append(gene.end)
        tss_genes.append(gene)

    peak_tss_distances = []
    peak_names = []
    closest_genes = []
    closest_gene_names = []
    closest_strands = []
    for _, peak in peaks.iterrows():
        #the double forward slash // is used to perform integer division instead of floating-point division
        peak_center = (peak['start'] + peak['end']) // 2
        closest_tss_index = np.argmin([abs(tss - peak_center) for tss in tss_positions])
        distance_to_tss = peak_center - tss_positions[closest_tss_index]
        peak_tss_distances.append(distance_to_tss)
        peak_names.append(peak['name'])
        closest_genes.append(tss_genes[closest_tss_index].id)
        #closest_genes.append(tss_genes[closest_tss_index].attributes['gene_name'][0])
        if 'gene_name' in tss_genes[closest_tss_index].attributes:
            closest_gene_name = tss_genes[closest_tss_index].attributes['gene_name'][0]
            #print(closest_genesymbol)
        else:
            closest_gene_name = 'N/A'  # Set a default value if 'gene_name' attribute is not present
            #print(closest_genesymbol)
        closest_gene_names.append(closest_gene_name)
        #attributes_obj = tss_genes[closest_tss_index].attributes
        #attribute_names = attributes_obj.keys()
        #print(attribute_names)
        #dict_keys(['gene_id', 'gene_type', 'gene_name', 'level', 'hgnc_id', 'tag', 'havana_gene'])
        #print(tss_genes[closest_tss_index].attributes['havana_gene'])
        #['ENSG00000235519.3'], ['lncRNA'], ['TLCD5'], ['2'], NONE, NONE, ['OTTHUMG00000195005.1']
        closest_strands.append(tss_genes[closest_tss_index].strand)

    # Save distances and gene info to an Excel file
    df = pd.DataFrame({
        'Peak Name': peak_names,
        'Distance to TSS': peak_tss_distances,
        'Closest Gene (Ensemble ID)': closest_genes,
        'Closest Gene (Gene name)': closest_gene_names,
        'Gene Strand': closest_strands
    })
    df.to_excel('peak_tss_distances.xlsx', index=False)

    # Calculate histogram and save bins and counts to a file
    counts, bin_edges = np.histogram(peak_tss_distances, bins=1000)
    bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2
    hist_df = pd.DataFrame({'Bin Center': bin_centers, 'Count': counts})
    hist_df.to_csv('peak_distribution.csv', index=False)
    total_peaks = hist_df['Count'].sum()
    with open('peak_distribution.csv', 'a') as f:
        f.write(f'\nTotal number of peaks: {total_peaks}')

    plt.hist(peak_tss_distances, bins=1000)
    plt.xlabel('Distance to TSS')
    plt.ylabel('Number of peaks')
    plt.title('Distribution of peaks around TSS')
    plt.savefig('peak_distribution.png')
    plt.show()

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Plot peak distribution around TSS.')
    parser.add_argument('peaks_file', type=str, help='Path to the peaks.bed file')
    parser.add_argument('gencode_db', type=str, help='Path to the gencode.v43.annotation.db file')

    args = parser.parse_args()
    plot_peak_distribution(args.peaks_file, args.gencode_db)

like unlike

点赞本文的读者

还没有人对此文章表态


本文有评论

没有评论

看文章,发评论,不要沉默

最受欢迎文章

  1. Motif Discovery in Biological Sequences: A Comparison of MEME and HOMER
  2. Why Do Significant Gene Lists Change After Adding Additional Conditions in Differential Gene Expression Analysis?
  3. Calling peaks using findPeaks of HOMER
  4. PiCRUST2 Pipeline for Functional Prediction and Pathway Analysis in Metagenomics
  5. Should the inputs for GSVA be normalized or raw?
  6. Updating Human Gene Identifiers using Ensembl BioMart: A Step-by-Step Guide
  7. Kraken2 Installation and Usage Guide
  8. pheatmap vs heatmap.2
  9. Setup conda environments
  10. Guide to Submitting Data to GEO (Gene Expression Omnibus)

最新文章


最多评论文章


推荐相似文章

Defining and Categorizing Promoter Types Based on the 'GRGGC' Motif Frequency, Distribution, and Distance to the Transcription Start Site (TSS)

Evaluating the Proximity of Genomic Features (=integration sites) to Peaks Using a Permutation Test genomic features

Density of motif plots and its statistical tests

Identifying the Nearest Genomic Peaks within Defined Regions


© 2023 XGenes.com Impressum