Analysis of Peak Distribution in Promoters

gene_x 0 like s 1096 view s

Tags: python, Biopython, genomics

  1. import pprint
  2. import argparse
  3. import matplotlib.pyplot as plt
  4. import pandas as pd
  5. import gffutils
  6. import numpy as np
  7. #db = gffutils.create_db('gencode.v43.annotation.gtf', dbfn='gencode.v43.annotation.db', force=True, keep_order=True, merge_strategy='merge', sort_attribute_values=True)
  8. #python3 plot_peaks4.py peaks.bed gencode.v43.annotation.db
  9. def plot_peak_distribution(peaks_file, gencode_db):
  10. db = gffutils.FeatureDB(gencode_db, keep_order=True)
  11. peaks = pd.read_table(peaks_file, header=None)
  12. peaks.columns = ['chrom', 'start', 'end', 'name', 'score']
  13. tss_positions = []
  14. tss_genes = []
  15. for gene in db.features_of_type('gene'):
  16. if gene.strand == '+':
  17. tss_positions.append(gene.start)
  18. else:
  19. tss_positions.append(gene.end)
  20. tss_genes.append(gene)
  21. peak_tss_distances = []
  22. peak_names = []
  23. closest_genes = []
  24. closest_gene_names = []
  25. closest_strands = []
  26. for _, peak in peaks.iterrows():
  27. #the double forward slash // is used to perform integer division instead of floating-point division
  28. peak_center = (peak['start'] + peak['end']) // 2
  29. closest_tss_index = np.argmin([abs(tss - peak_center) for tss in tss_positions])
  30. distance_to_tss = peak_center - tss_positions[closest_tss_index]
  31. peak_tss_distances.append(distance_to_tss)
  32. peak_names.append(peak['name'])
  33. closest_genes.append(tss_genes[closest_tss_index].id)
  34. #closest_genes.append(tss_genes[closest_tss_index].attributes['gene_name'][0])
  35. if 'gene_name' in tss_genes[closest_tss_index].attributes:
  36. closest_gene_name = tss_genes[closest_tss_index].attributes['gene_name'][0]
  37. #print(closest_genesymbol)
  38. else:
  39. closest_gene_name = 'N/A' # Set a default value if 'gene_name' attribute is not present
  40. #print(closest_genesymbol)
  41. closest_gene_names.append(closest_gene_name)
  42. #attributes_obj = tss_genes[closest_tss_index].attributes
  43. #attribute_names = attributes_obj.keys()
  44. #print(attribute_names)
  45. #dict_keys(['gene_id', 'gene_type', 'gene_name', 'level', 'hgnc_id', 'tag', 'havana_gene'])
  46. #print(tss_genes[closest_tss_index].attributes['havana_gene'])
  47. #['ENSG00000235519.3'], ['lncRNA'], ['TLCD5'], ['2'], NONE, NONE, ['OTTHUMG00000195005.1']
  48. closest_strands.append(tss_genes[closest_tss_index].strand)
  49. # Save distances and gene info to an Excel file
  50. df = pd.DataFrame({
  51. 'Peak Name': peak_names,
  52. 'Distance to TSS': peak_tss_distances,
  53. 'Closest Gene (Ensemble ID)': closest_genes,
  54. 'Closest Gene (Gene name)': closest_gene_names,
  55. 'Gene Strand': closest_strands
  56. })
  57. df.to_excel('peak_tss_distances.xlsx', index=False)
  58. # Calculate histogram and save bins and counts to a file
  59. counts, bin_edges = np.histogram(peak_tss_distances, bins=1000)
  60. bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2
  61. hist_df = pd.DataFrame({'Bin Center': bin_centers, 'Count': counts})
  62. hist_df.to_csv('peak_distribution.csv', index=False)
  63. total_peaks = hist_df['Count'].sum()
  64. with open('peak_distribution.csv', 'a') as f:
  65. f.write(f'\nTotal number of peaks: {total_peaks}')
  66. plt.hist(peak_tss_distances, bins=1000)
  67. plt.xlabel('Distance to TSS')
  68. plt.ylabel('Number of peaks')
  69. plt.title('Distribution of peaks around TSS')
  70. plt.savefig('peak_distribution.png')
  71. plt.show()
  72. if __name__ == "__main__":
  73. parser = argparse.ArgumentParser(description='Plot peak distribution around TSS.')
  74. parser.add_argument('peaks_file', type=str, help='Path to the peaks.bed file')
  75. parser.add_argument('gencode_db', type=str, help='Path to the gencode.v43.annotation.db file')
  76. args = parser.parse_args()
  77. plot_peak_distribution(args.peaks_file, args.gencode_db)

like unlike

点赞本文的读者

还没有人对此文章表态


本文有评论

没有评论

看文章,发评论,不要沉默


© 2023 XGenes.com Impressum