Run GAMOLA2 on MT880870-MT880872

huang 0 like s 1052 view s

Tags: Tools

  1. #-- view process.py --
  2. with open("MT880872_CDS.fasta", "r") as f:
  3. for line in f:
  4. if line.startswith(">"):
  5. header_words = line.strip().split("locus_tag=")
  6. third_word = header_words[1]
  7. third_word_ = third_word.strip().split("]")
  8. third_word__ = third_word_[0]
  9. print(">"+third_word__)
  10. else:
  11. print(line.strip())
  12. #input_file: MT880870_.fasta as pan_genome_reference_.fa; MT880870_.fasta is modified from MT880870.fasta by simplying the fasta headers.
  13. echo ">MT880870_genes" > MT880870_genes.fa;
  14. merge_seq.py MT880870_.fasta >> MT880870_genes.fa;
  15. samtools faidx MT880870_genes.fa
  16. bioawk -c fastx '{ print $name, length($seq) }' < MT880870_.fasta > length.txt
  17. generate_gene_model.py length.txt > /media/jhuang/Titisee/GAMOLA2/Results/gene_models/MT880870_genes.fa.combined;
  18. cp MT880870_genes.fa /media/jhuang/Titisee/GAMOLA2/Input_sequences
  19. python process.py > MT880871_.fasta
  20. echo ">MT880871_genes" > MT880871_genes.fa;
  21. merge_seq.py MT880871_.fasta >> MT880871_genes.fa;
  22. samtools faidx MT880871_genes.fa
  23. bioawk -c fastx '{ print $name, length($seq) }' < MT880871_.fasta > length.txt
  24. generate_gene_model.py length.txt > /media/jhuang/Titisee/GAMOLA2/Results/gene_models/MT880871_genes.fa.combined;
  25. cp MT880871_genes.fa /media/jhuang/Titisee/GAMOLA2/Input_sequences
  26. python process.py > MT880872_.fasta
  27. echo ">MT880872_genes" > MT880872_genes.fa;
  28. merge_seq.py MT880872_.fasta >> MT880872_genes.fa;
  29. samtools faidx MT880872_genes.fa
  30. bioawk -c fastx '{ print $name, length($seq) }' < MT880872_.fasta > length.txt
  31. generate_gene_model.py length.txt > /media/jhuang/Titisee/GAMOLA2/Results/gene_models/MT880872_genes.fa.combined;
  32. cp MT880872_genes.fa /media/jhuang/Titisee/GAMOLA2/Input_sequences
  33. #---- 4 ----
  34. cd /media/jhuang/Titisee/GAMOLA2
  35. #WARNING: !!swissprot as default database, manually choose nr as Blast_db --> too slow, better choose swissprot, it is quick!!
  36. ./Gamola.pl #No Glimmer model and Critica database due to self-extracted ORF; choosing nr.pal or swissprot.pal as Blast_db; COG2014 as COG_db; Pfam-A.hmm as Pfam_db; No Rfam_db; TIGRFAMS_15.0_HMM.LIB as TIGRfam_db
  37. /media/jhuang/Titisee/GAMOLA2/Results/COG_results
  38. #grep "Class: ," roary_182s_95.fa_COG_*
  39. for sample in 10601 10710 11187 11377 12474 12504 1520 1551 2092 2160 2467 289 3455 4694 5862 5863 6166 6464 7618 8007 8406 8784 9157 9373 953; do
  40. sed -i -e 's/Class: ,/Class: None, None/g' roary_182s_95.fa_COG_${sample}
  41. done
  42. ./Gamola.pl
  43. #---- 5 ---- update locustag
  44. cp /media/jhuang/Titisee/GAMOLA2/Consolidated_results/MT880870_genes.fa/MT880870_genes.fa.gb ./
  45. awk '{print $0 "ORIGIN"> "file" NR}' RS='ORIGIN' MT880870_genes.fa.gb
  46. sed -i 's/<!-- /g' file2
  47. sed -i 's/^ //g' file2
  48. cat file1 file2 --> MT880870_genes_.fa.gb
  49. iconv -t UTF-8 -f Windows-1252 MT880870_genes_.fa.gb
  50. #89600270 Jan 26 13:44 roary_182s_95__.fa.gb
  51. #-- if ran [3.2 selected]--
  52. python ~/Scripts/update_locustag.py MT880870_genes_.fa.gb MT880870_.fasta.fai > MT880870_genes__.fa.gb
  53. > MT880870_genes__.fa.gb #clean old *__.fa.gb
  54. #---- 6.6 ----
  55. cut -d$'\t' -f1 MT880870_.fasta.fai > MT880870_f1
  56. process_gamola_gb_plus.py MT880870_f1 MT880870_genes__.fa.gb > annotated_MT880870.txt #!!
  57. Some fields may be wrong.
  58. BiopythonParserWarning)
  59. /usr/local/lib/python2.7/dist-packages/Bio/GenBank/__init__.py:1047: BiopythonParserWarning: Ignoring invalid location: '1186620..1185596'
  60. sed -i -e 's/1186620\.\.1185596/1185596\.\.1186620/g' roary__.fa.gb
  61. #DEBUG "group_12676" -> "group_12667"
  62. #---- ADD DNA-Sequences add the end of table as the last column ----
  63. cut -d',' -f1-1 annotated_MT880870.txt > get_seq_ORF.sh
  64. #extend the file get_seq_ORF.sh to extract all DNA-sequences
  65. #mv the bash file under DIR of MT880870_.fasta.fai
  66. #samtools faidx MT880870_.fasta "PLKLOBMN_00001" > seq_ORF.fasta
  67. #samtools faidx MT880870_.fasta "PLKLOBMN_00002" >> seq_ORF.fasta
  68. #or in the case simply "cp MT880870_.fasta seq_ORF.fasta"
  69. awk '/^>/ {printf("\n%s\n",$0);next; } { printf("%s",$0);} END {printf("\n");}' < seq_ORF.fasta > seq_ORF_.fasta
  70. grep -v ">" seq_ORF_.fasta > seq_ORF__.txt
  71. #add "Sequence" to the first line of seq_ORF__.txt
  72. paste -d',' annotated_MT880870.txt seq_ORF__.txt > annotated_MT880870_.txt
  73. #---- repeat 5-->6.6 for MT880871 and MT880872 --------------
  74. cp /media/jhuang/Titisee/GAMOLA2/Consolidated_results/MT880871_genes.fa/MT880871_genes.fa.gb ./
  75. awk '{print $0 "ORIGIN"> "file" NR}' RS='ORIGIN' MT880871_genes.fa.gb
  76. sed -i 's/<!-- /g' file2
  77. sed -i 's/^ //g' file2
  78. cat file1 file2 --> MT880871_genes_.fa.gb
  79. iconv -t UTF-8 -f Windows-1252 MT880871_genes_.fa.gb
  80. python ~/Scripts/update_locustag.py MT880871_genes_.fa.gb MT880871_.fasta.fai > MT880871_genes__.fa.gb
  81. #remove the last line 'ORIGIN'
  82. cut -d$'\t' -f1 MT880871_.fasta.fai > MT880871_f1
  83. process_gamola_gb_plus.py MT880871_f1 MT880871_genes__.fa.gb > annotated_MT880871.txt
  84. cp MT880871_.fasta seq_ORF_71.fasta
  85. awk '/^>/ {printf("\n%s\n",$0);next; } { printf("%s",$0);} END {printf("\n");}' < seq_ORF_71.fasta > seq_ORF_71_.fasta
  86. grep -v ">" seq_ORF_71_.fasta > seq_ORF_71__.txt
  87. #add 'Sequence' in the first line in seq_ORF_71__.txt
  88. paste -d',' annotated_MT880871.txt seq_ORF_71__.txt > annotated_MT880871_.txt
  89. #--
  90. cp /media/jhuang/Titisee/GAMOLA2/Consolidated_results/MT880872_genes.fa/MT880872_genes.fa.gb ./
  91. awk '{print $0 "ORIGIN"> "file" NR}' RS='ORIGIN' MT880872_genes.fa.gb
  92. sed -i 's/<!-- /g' file2
  93. sed -i 's/^ //g' file2
  94. cat file1 file2 --> MT880872_genes_.fa.gb
  95. iconv -t UTF-8 -f Windows-1252 MT880872_genes_.fa.gb
  96. python ~/Scripts/update_locustag.py MT880872_genes_.fa.gb MT880872_.fasta.fai > MT880872_genes__.fa.gb
  97. #remove the last line 'ORIGIN'
  98. cut -d$'\t' -f1 MT880872_.fasta.fai > MT880872_f1
  99. process_gamola_gb_plus.py MT880872_f1 MT880872_genes__.fa.gb > annotated_MT880872.txt
  100. cp MT880872_.fasta seq_ORF_72.fasta
  101. awk '/^>/ {printf("\n%s\n",$0);next; } { printf("%s",$0);} END {printf("\n");}' < seq_ORF_72.fasta > seq_ORF_72_.fasta
  102. grep -v ">" seq_ORF_72_.fasta > seq_ORF_72__.txt
  103. #add 'Sequence' in the first line in seq_ORF_72__.txt
  104. paste -d',' annotated_MT880872.txt seq_ORF_72__.txt > annotated_MT880872_.txt
  105. mv annotated_MT880870_.txt annotated__MT880870_.txt
  106. #~/Tools/csv2xls-0.4/csv_to_xls.py annotated__MT880870_.txt annotated_MT880871_.txt annotated_MT880872_.txt -d',' -o annotated_MT880870-MT880872.xls

like unlike

点赞本文的读者

还没有人对此文章表态


本文有评论

没有评论

看文章,发评论,不要沉默


© 2023 XGenes.com Impressum