Run GAMOLA2 on MT880870-MT880872

huang 0 like s 673 view s

Tags: Tools

#-- view process.py --
with open("MT880872_CDS.fasta", "r") as f:
    for line in f:
        if line.startswith(">"):
            header_words = line.strip().split("locus_tag=")
            third_word = header_words[1]
            third_word_ = third_word.strip().split("]")
            third_word__ = third_word_[0]
            print(">"+third_word__)
        else:
            print(line.strip())

    #input_file: MT880870_.fasta as pan_genome_reference_.fa; MT880870_.fasta is modified from MT880870.fasta by simplying the fasta headers.
    echo ">MT880870_genes" > MT880870_genes.fa;
    merge_seq.py MT880870_.fasta >> MT880870_genes.fa;
    samtools faidx MT880870_genes.fa
    bioawk -c fastx '{ print $name, length($seq) }' < MT880870_.fasta > length.txt 
    generate_gene_model.py length.txt > /media/jhuang/Titisee/GAMOLA2/Results/gene_models/MT880870_genes.fa.combined;
    cp MT880870_genes.fa /media/jhuang/Titisee/GAMOLA2/Input_sequences

    python process.py > MT880871_.fasta
    echo ">MT880871_genes" > MT880871_genes.fa;
    merge_seq.py MT880871_.fasta >> MT880871_genes.fa;
    samtools faidx MT880871_genes.fa
    bioawk -c fastx '{ print $name, length($seq) }' < MT880871_.fasta > length.txt 
    generate_gene_model.py length.txt > /media/jhuang/Titisee/GAMOLA2/Results/gene_models/MT880871_genes.fa.combined;
    cp MT880871_genes.fa /media/jhuang/Titisee/GAMOLA2/Input_sequences

    python process.py > MT880872_.fasta
    echo ">MT880872_genes" > MT880872_genes.fa;
    merge_seq.py MT880872_.fasta >> MT880872_genes.fa;
    samtools faidx MT880872_genes.fa
    bioawk -c fastx '{ print $name, length($seq) }' < MT880872_.fasta > length.txt 
    generate_gene_model.py length.txt > /media/jhuang/Titisee/GAMOLA2/Results/gene_models/MT880872_genes.fa.combined;
    cp MT880872_genes.fa /media/jhuang/Titisee/GAMOLA2/Input_sequences

    #---- 4 ----
    cd /media/jhuang/Titisee/GAMOLA2
    #WARNING: !!swissprot as default database, manually choose nr as Blast_db --> too slow, better choose swissprot, it is quick!!
    ./Gamola.pl    #No Glimmer model and Critica database due to self-extracted ORF; choosing nr.pal or swissprot.pal as Blast_db; COG2014 as COG_db; Pfam-A.hmm as Pfam_db; No Rfam_db; TIGRFAMS_15.0_HMM.LIB as TIGRfam_db
    /media/jhuang/Titisee/GAMOLA2/Results/COG_results
    #grep "Class: ," roary_182s_95.fa_COG_*
    for sample in 10601 10710 11187 11377 12474 12504  1520 1551 2092 2160 2467   289 3455 4694 5862 5863 6166 6464 7618 8007 8406 8784 9157 9373 953; do
      sed -i -e 's/Class: ,/Class: None, None/g' roary_182s_95.fa_COG_${sample}
    done
    ./Gamola.pl

    #---- 5 ---- update locustag
    cp /media/jhuang/Titisee/GAMOLA2/Consolidated_results/MT880870_genes.fa/MT880870_genes.fa.gb ./
    awk '{print $0 "ORIGIN"> "file" NR}' RS='ORIGIN' MT880870_genes.fa.gb
    sed -i 's/<!-- /g' file2
    sed -i 's/^ //g' file2
    cat file1 file2 --> MT880870_genes_.fa.gb
    iconv -t UTF-8 -f Windows-1252 MT880870_genes_.fa.gb
    #89600270 Jan 26 13:44 roary_182s_95__.fa.gb
    #-- if ran [3.2 selected]--
    python ~/Scripts/update_locustag.py MT880870_genes_.fa.gb MT880870_.fasta.fai > MT880870_genes__.fa.gb
    > MT880870_genes__.fa.gb    #clean old *__.fa.gb

    #---- 6.6 ----
    cut -d$'\t' -f1 MT880870_.fasta.fai > MT880870_f1
    process_gamola_gb_plus.py MT880870_f1 MT880870_genes__.fa.gb > annotated_MT880870.txt  #!!

    Some fields may be wrong.
      BiopythonParserWarning)
    /usr/local/lib/python2.7/dist-packages/Bio/GenBank/__init__.py:1047: BiopythonParserWarning: Ignoring invalid location: '1186620..1185596'
    sed -i -e 's/1186620\.\.1185596/1185596\.\.1186620/g' roary__.fa.gb
    #DEBUG "group_12676" -> "group_12667"

    #---- ADD DNA-Sequences add the end of table as the last column ----
    cut -d',' -f1-1 annotated_MT880870.txt > get_seq_ORF.sh
    #extend the file get_seq_ORF.sh to extract all DNA-sequences
    #mv the bash file under DIR of MT880870_.fasta.fai 
    #samtools faidx MT880870_.fasta "PLKLOBMN_00001" > seq_ORF.fasta
    #samtools faidx MT880870_.fasta "PLKLOBMN_00002" >> seq_ORF.fasta
    #or in the case simply "cp MT880870_.fasta seq_ORF.fasta"
    awk '/^>/ {printf("\n%s\n",$0);next; } { printf("%s",$0);}  END {printf("\n");}' < seq_ORF.fasta > seq_ORF_.fasta
    grep -v ">" seq_ORF_.fasta > seq_ORF__.txt
    #add "Sequence" to the first line of seq_ORF__.txt
    paste -d',' annotated_MT880870.txt seq_ORF__.txt > annotated_MT880870_.txt

    #---- repeat 5-->6.6 for MT880871 and MT880872 --------------
    cp /media/jhuang/Titisee/GAMOLA2/Consolidated_results/MT880871_genes.fa/MT880871_genes.fa.gb ./
    awk '{print $0 "ORIGIN"> "file" NR}' RS='ORIGIN' MT880871_genes.fa.gb
    sed -i 's/<!-- /g' file2
    sed -i 's/^ //g' file2
    cat file1 file2 --> MT880871_genes_.fa.gb
    iconv -t UTF-8 -f Windows-1252 MT880871_genes_.fa.gb
    python ~/Scripts/update_locustag.py MT880871_genes_.fa.gb MT880871_.fasta.fai > MT880871_genes__.fa.gb
    #remove the last line 'ORIGIN'
    cut -d$'\t' -f1 MT880871_.fasta.fai > MT880871_f1
    process_gamola_gb_plus.py MT880871_f1 MT880871_genes__.fa.gb > annotated_MT880871.txt

    cp MT880871_.fasta seq_ORF_71.fasta
    awk '/^>/ {printf("\n%s\n",$0);next; } { printf("%s",$0);}  END {printf("\n");}' < seq_ORF_71.fasta > seq_ORF_71_.fasta
    grep -v ">" seq_ORF_71_.fasta > seq_ORF_71__.txt
    #add 'Sequence' in the first line in seq_ORF_71__.txt
    paste -d',' annotated_MT880871.txt seq_ORF_71__.txt > annotated_MT880871_.txt

    #--
    cp /media/jhuang/Titisee/GAMOLA2/Consolidated_results/MT880872_genes.fa/MT880872_genes.fa.gb ./
    awk '{print $0 "ORIGIN"> "file" NR}' RS='ORIGIN' MT880872_genes.fa.gb
    sed -i 's/<!-- /g' file2
    sed -i 's/^ //g' file2
    cat file1 file2 --> MT880872_genes_.fa.gb
    iconv -t UTF-8 -f Windows-1252 MT880872_genes_.fa.gb
    python ~/Scripts/update_locustag.py MT880872_genes_.fa.gb MT880872_.fasta.fai > MT880872_genes__.fa.gb
    #remove the last line 'ORIGIN'
    cut -d$'\t' -f1 MT880872_.fasta.fai > MT880872_f1
    process_gamola_gb_plus.py MT880872_f1 MT880872_genes__.fa.gb > annotated_MT880872.txt

    cp MT880872_.fasta seq_ORF_72.fasta
    awk '/^>/ {printf("\n%s\n",$0);next; } { printf("%s",$0);}  END {printf("\n");}' < seq_ORF_72.fasta > seq_ORF_72_.fasta
    grep -v ">" seq_ORF_72_.fasta > seq_ORF_72__.txt
    #add 'Sequence' in the first line in seq_ORF_72__.txt
    paste -d',' annotated_MT880872.txt seq_ORF_72__.txt > annotated_MT880872_.txt

    mv annotated_MT880870_.txt annotated__MT880870_.txt
    #~/Tools/csv2xls-0.4/csv_to_xls.py annotated__MT880870_.txt annotated_MT880871_.txt annotated_MT880872_.txt -d',' -o annotated_MT880870-MT880872.xls

like unlike

点赞本文的读者

还没有人对此文章表态


本文有评论

没有评论

看文章,发评论,不要沉默


© 2023 XGenes.com Impressum