Analysis of Peak Distribution in Promoters

gene_x 0 like s 493 view s

Tags: python, Biopython, genomics

import pprint
import argparse
import matplotlib.pyplot as plt
import pandas as pd
import gffutils
import numpy as np

#db = gffutils.create_db('gencode.v43.annotation.gtf', dbfn='gencode.v43.annotation.db', force=True, keep_order=True, merge_strategy='merge', sort_attribute_values=True)
#python3 plot_peaks4.py peaks.bed gencode.v43.annotation.db

def plot_peak_distribution(peaks_file, gencode_db):
    db = gffutils.FeatureDB(gencode_db, keep_order=True)

    peaks = pd.read_table(peaks_file, header=None)
    peaks.columns = ['chrom', 'start', 'end', 'name', 'score']

    tss_positions = []
    tss_genes = []
    for gene in db.features_of_type('gene'):
        if gene.strand == '+':
            tss_positions.append(gene.start)
        else:
            tss_positions.append(gene.end)
        tss_genes.append(gene)

    peak_tss_distances = []
    peak_names = []
    closest_genes = []
    closest_gene_names = []
    closest_strands = []
    for _, peak in peaks.iterrows():
        #the double forward slash // is used to perform integer division instead of floating-point division
        peak_center = (peak['start'] + peak['end']) // 2
        closest_tss_index = np.argmin([abs(tss - peak_center) for tss in tss_positions])
        distance_to_tss = peak_center - tss_positions[closest_tss_index]
        peak_tss_distances.append(distance_to_tss)
        peak_names.append(peak['name'])
        closest_genes.append(tss_genes[closest_tss_index].id)
        #closest_genes.append(tss_genes[closest_tss_index].attributes['gene_name'][0])
        if 'gene_name' in tss_genes[closest_tss_index].attributes:
            closest_gene_name = tss_genes[closest_tss_index].attributes['gene_name'][0]
            #print(closest_genesymbol)
        else:
            closest_gene_name = 'N/A'  # Set a default value if 'gene_name' attribute is not present
            #print(closest_genesymbol)
        closest_gene_names.append(closest_gene_name)
        #attributes_obj = tss_genes[closest_tss_index].attributes
        #attribute_names = attributes_obj.keys()
        #print(attribute_names)
        #dict_keys(['gene_id', 'gene_type', 'gene_name', 'level', 'hgnc_id', 'tag', 'havana_gene'])
        #print(tss_genes[closest_tss_index].attributes['havana_gene'])
        #['ENSG00000235519.3'], ['lncRNA'], ['TLCD5'], ['2'], NONE, NONE, ['OTTHUMG00000195005.1']
        closest_strands.append(tss_genes[closest_tss_index].strand)

    # Save distances and gene info to an Excel file
    df = pd.DataFrame({
        'Peak Name': peak_names,
        'Distance to TSS': peak_tss_distances,
        'Closest Gene (Ensemble ID)': closest_genes,
        'Closest Gene (Gene name)': closest_gene_names,
        'Gene Strand': closest_strands
    })
    df.to_excel('peak_tss_distances.xlsx', index=False)

    # Calculate histogram and save bins and counts to a file
    counts, bin_edges = np.histogram(peak_tss_distances, bins=1000)
    bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2
    hist_df = pd.DataFrame({'Bin Center': bin_centers, 'Count': counts})
    hist_df.to_csv('peak_distribution.csv', index=False)
    total_peaks = hist_df['Count'].sum()
    with open('peak_distribution.csv', 'a') as f:
        f.write(f'\nTotal number of peaks: {total_peaks}')

    plt.hist(peak_tss_distances, bins=1000)
    plt.xlabel('Distance to TSS')
    plt.ylabel('Number of peaks')
    plt.title('Distribution of peaks around TSS')
    plt.savefig('peak_distribution.png')
    plt.show()

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Plot peak distribution around TSS.')
    parser.add_argument('peaks_file', type=str, help='Path to the peaks.bed file')
    parser.add_argument('gencode_db', type=str, help='Path to the gencode.v43.annotation.db file')

    args = parser.parse_args()
    plot_peak_distribution(args.peaks_file, args.gencode_db)

like unlike

点赞本文的读者

还没有人对此文章表态


本文有评论

没有评论

看文章,发评论,不要沉默


© 2023 XGenes.com Impressum