How to run the pipeline vrap?

gene_x 0 like s 1240 view s

Tags: processing, virus, pipeline

download viral database

cp download_db.py down_db.py.backup
sed -i -e 's/39733/1396/g' download_db.py
#NOTE the download_db.py cannot run alone, rather than via "vrap.py -u"
(damian) jhuang@hamburg:~/DATA/Data_Tam_Acinetobacter_baumannii$ ./vrap/vrap.py -u
mv vrap/database/viral_db vrap/database/viral_db_Bacillus_cereus

run vrap.py

#--host=/home/jhuang/REFs/Sus_scrofa.Sscrofa11.1.dna.toplevel.fa 
vrap.py -1 ./results_ATCC19606/trimmed/A10_CraA_HQ_trimmed_P_1.fastq.gz -2 ./results_ATCC19606/trimmed/A10_CraA_HQ_trimmed_P_2.fastq.gz -o A10_vrap_out --virus=/home/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/ATCC19606.fasta -n /media/jhuang/Titisee/GAMOLA2/Blast_db/nt -a /media/jhuang/Titisee/GAMOLA2/Blast_db/nr -t 4
#vrap/vrap.py -1 220211_M03701_0272_000000000-JWYW9/p12568/BovHepV_Bulgaria_19_S3_R1_001.fastq.gz -2 220211_M03701_0272_000000000-JWYW9/p12568/BovHepV_Bulgaria_19_S3_R2_001.fastq.gz -o BovHepV_Bulgaria_19_on_Hepacivirus --host=/home/jhuang/REFs/Bos_taurus.ARS-UCD1.2.dna.toplevel.fa  -t 3
#IMPORTANT, it doesn't work, since the update will always downlod the repeated sequences causing ERROR! It needs to delete it as follows!!

#In summary: --(optional) host --> bowtie2/host (bowtie_database for host-substraction) + blast/db/host (in nt_dbs, custom blastn database before downloaded_db!)
#            --(optional) virus --> blast/db/virus (in nt_dbs, custom blastn database before downloaded_db!)
#            --(always) defined in download_db.py (setting by modifying the file) and run default --> database/viral_db/viral_nucleotide (nt_dbs)
#                                                                                        --> database/viral_db/viral_protein (prot_db)
#            --(optional) nt+nr
#Therefore, the results substracted host should not need the blast with host, however it it in nt_dbs!

#Under (vrap) jhuang@hamm:~/DATA/Data_Tam_Acinetobacter_baumannii$ 
#DEBUG: {path}/external_tools/Lighter-master/lighter-->/home/jhuang/miniconda3/envs/vrap/bin/lighter
#DEBUG: {path}/external_tools/SPAdes-3.11.1-Linux/bin/spades.py-->/home/jhuang/miniconda3/envs/vrap/bin/spades.py

#A6: --host=/home/jhuang/REFs/Sus_scrofa.Sscrofa11.1.dna.toplevel.fa  --virus=/home/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/vrap/database/viral_db_Bacillus_cereus/nucleotide_Bacillus_cereus_0.01.fasta
vrap/vrap.py -1 ./results_ATCC19606/trimmed/A6_WT_HQ_trimmed_P_1.fastq.gz -2 ./results_ATCC19606/trimmed/A6_WT_HQ_trimmed_P_2.fastq.gz -o A6_vrap_out  -n /mnt/h1/jhuang/blast/nt -a /mnt/h1/jhuang/blast/nr -t 55

#A10　
#START checking blast db: no host and with virus --> mapping to 5 dbs: blast/db/virus, viral_nucleotide, viral_protein, nt, nr
vrap/vrap.py -1 ./results_ATCC19606/trimmed/A10_CraA_HQ_trimmed_P_1.fastq.gz -2 ./results_ATCC19606/trimmed/A10_CraA_HQ_trimmed_P_2.fastq.gz -o A10_vrap_out --virus=/home/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/vrap/database/viral_db_Bacillus_cereus/nucleotide_Bacillus_cereus_0.01.fasta   -n /mnt/h1/jhuang/blast/nt -a /mnt/h1/jhuang/blast/nr -t 55

#A12
vrap/vrap.py -1 ./results_AYE/trimmed/A12_AYE_HQ_trimmed_P_1.fastq.gz -2 ./results_AYE/trimmed/A12_AYE_HQ_trimmed_P_2.fastq.gz -o A12_vrap_out  -n /mnt/h1/jhuang/blast/nt -a /mnt/h1/jhuang/blast/nr -t 55

#A19 
vrap/vrap.py -1 ./results_ATCC17978/trimmed/A19_17978_HQ_trimmed_P_1.fastq.gz -2 ./results_ATCC17978/trimmed/A19_17978_HQ_trimmed_P_2.fastq.gz -o A19_vrap_out  -n /mnt/h1/jhuang/blast/nt -a /mnt/h1/jhuang/blast/nr -t 55

#START VrAP
#START checking blast db
#START lighter
#START estimate k-mer size
#k-mer size: 5964136
#/home/jhuang/miniconda3/envs/vrap/bin/lighter -r ./results_ATCC19606/trimmed/A6_WT_HQ_trimmed_P_1.fastq.gz -r ./results_ATCC19606/trimmed/A6_WT_HQ_trimmed_P_2.fastq.gz -K 20 5964136 -od /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A6_vrap_out/lighter -t 55 2>> /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A6_vrap_out/vrap.log
#START flash
#/home/jhuang/Tools/vrap/external_tools/FLASH-1.2.11/flash /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A6_vrap_out/lighter/A6_WT_HQ_trimmed_P_1.cor.fq.gz /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A6_vrap_out/lighter/A6_WT_HQ_trimmed_P_2.cor.fq.gz -o flash -d /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A6_vrap_out/flash -M 1000 >> /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A6_vrap_out/vrap.log
#START spades

vrap.py (code)

#!/usr/bin/env python3

from orf import ORF
from optparse import OptionParser
import os
import glob
from Bio import SeqIO
from pathlib import Path
# see blast.py
# .ncbirc --> BLASTDB=/media/jhuang/Titisee/GAMOLA2/Blast_db/ --> /mnt/Samsung_T5/Blast_db/
from blast import Blast
from hmmer import HMMER
from bowtie import Bowtie
# download_db.py only for txid2697049 [Severe acute respiratory syndrome coronavirus 2]
from download_db import Database
import glob
import logging
from math import exp
from scipy.optimize import brentq

#old = "_2010"
old = ""

class VrAP:
    """ VrAP """

    def __init__ (self):
        """ Class initialiser """
        self.version = "1.0.5"
        option = self.option_parser()
        self.path = os.path.dirname(os.path.realpath(__file__))

        self.paired_end = True
        self.p1  = option.p1
        self.p2  = option.p2
        self.single = ""
        self.cor1 = self.p1
        self.cor2 = self.p2
        self.cpu = option.cpu
        self.contigs = ""
        self.contig_length = option.clength
        self.host = option.host
        self.virus = option.virus
        self.nt = option.nt
        self.nr = option.nr
        self.bt2idx = option.bt2idx
        self.update = option.update
        self.gbt2 = option.gbowtie
        self.pre = option.pre
        self.assembly = option.assembly
        self.noblast = option.noblast

        if option.out!=None:
            self.out = os.path.abspath(option.out)
        else:
            self.out = "./"

        if self.update:
            self.update_database()

        if self.cpu==None:
            self.cpu = 1

        if self.p2==None:
            self.single = self.p1
            self.paired_end = False

        self.create_output_dir()

        open("{}/vrap.log".format(self.out),"w").close()
        self.log("START VrAP")

        with open("{}/vrap.log".format(self.out),"a") as log_opt:
            for opt, value in option.__dict__.items():
                log_opt.write("{}:\t{}\n".format(opt,value))

        if self.pre:
            self.log("START only read correction and assembly!")

        self.check_input()

    def check_input (self):
        """ check input """

        bt2idx = self.bt2idx
        if self.bt2idx!=None:
            bt2idx = self.bt2idx+".1.bt2"

        input_files = [self.p1,self.p2,self.host,self.virus,bt2idx]

        for f in input_files:
            #print(f)
            if f != None:
                if not os.path.exists(f):

                    print("{} don't exists\nPlease check input files\nVrAP quit\n".format(f))
                    quit()

        if self.p1 == None:
            self.parser.print_help()
            print("-1/--p1 is mandatory\nPlease check input file\nVrAP quit\n")
            quit()

        #EXPLANATION: in total, mapping to 4 dbs: viral_nucleotide, viral_protein, nt, nr
        dbs = ["{path}/database/viral_db/viral_nucleotide".format(path=self.path),"{path}/database/viral_db/viral_protein".format(path=self.path),self.nr,self.nt]
        status = []
        self.log("START checking blast db")

        for db in dbs:
            x = os.system("{path}/external_tools/blast/blastdbcmd -db {db} -info >> {out}/vrap.log 2>> {out}/vrap.log".format(path=self.path,db=db,out=self.out))
            status.append(x)

        #COMMENTED
        if status[0] != 0 or status[1] != 0:
            self.update_database()

        if self.nr != None and status[2] != 0:
            print("No alias or index file found for {}".format(self.nr))
            quit()

        if self.nt != None and status[3] != 0:
            print("No alias or index file found for {}".format(self.nt))
            quit()

    def log (self,message,err=False):
        """ log bowtie run """
        print(message)
        LOG_FILENAME = "{}/vrap.log".format(self.out)
        FORMAT = '\n[VrAP] %(asctime)-15s %(message)s\n'

        logging.basicConfig(filename=LOG_FILENAME,format=FORMAT,level=logging.DEBUG)
        if err:
            logging.exception(message)
        else:
            logging.info(message)
        #logging.exception()

    def option_parser(self):
        """ Read input options """
        usage = "usage: %prog -1 [FILE] [options]"
        parser = OptionParser(usage=usage,version="%prog {}".format(self.version))

        parser.add_option("-1","--p1", dest="p1",help="first paired end file" , metavar="FILE")
        parser.add_option("-2","--p2", dest="p2",help="second paired end file", metavar="FILE")
        parser.add_option("-t","--threads", dest="cpu",help="number of threads to use", metavar="INT",type="int",default=1)
        parser.add_option("-r","--host", dest="host",help="reference host sequences", metavar="FASTA")
        parser.add_option("-v","--virus", dest="virus",help="reference viral sequences", metavar="FASTA")
        parser.add_option("-n","--nt", dest="nt",help="nucleotide blast database (e.g. nt)", metavar="DATABASE")
        parser.add_option("-a","--nr", dest="nr",help="protein blast database (e.g. nr)", metavar="DATABASE")
        parser.add_option("-o","--outdir", dest="out",help="output directory", metavar="DIR")
        parser.add_option("-b","--bt2idx", dest="bt2idx",help="bowtie2 index", metavar="FILE")
        parser.add_option("-l","--clength", dest="clength",help="minimum contig length", metavar="INT",default=500)
        parser.add_option("-u","--update", dest="update",help="update viral database",  default=False, action="store_true")
        parser.add_option("-g","--gbt2", dest="gbowtie",help="use global PATH of bowtie2",  default=False, action="store_true")
        parser.add_option("-p","--pre", dest="pre",help="read correction only",  default=False, action="store_true")
        parser.add_option("-q","--assembly", dest="assembly",help="assembly only",  default=False, action="store_true")
        parser.add_option("-k","--noblast", dest="noblast",help="skip blast search",  default=False, action="store_true")
        self.parser = parser
        (options, args) = parser.parse_args()

        return options

    def create_output_dir(self):
        """ Create output directory """
        if not os.path.exists(self.out):
            os.makedirs(self.out)

    def EMfit2(self,F0, f1, F1, k):
        epstol = 1e-8
        x = F1 / F0
        e = 0
        e0 = 1
        x0 = 0
        maxit = 1000
        it = 0
        while abs(x - x0) / x + abs(e - e0) > epstol and it < maxit:
            it = it + 1
            #print x, e, abs(x-x0)/x
            x0 = x
            e0 = e
            x1 = -100
            while abs(x - x1) / x > epstol / 2:
                x1 = x
                x  = F1 / F0 * (3 * k * (1 - exp(-x * e / (3 * k))) + 1 - exp(-x * (1 - e)))
            def func(t):
                return f1 / F1 - (t * exp(-x * t / (3 * k)) + (1 - t) * exp(-x * (1 - t)))
            print(func(0))
            print(func(1.2))
            e = brentq(func, 0, 1.1)
        return x, e

    def genome_size (self):
        """ estimate genome size """
        self.log("START estimate k-mer size")

        lighter_path = "{}/lighter".format(self.out)
        if not os.path.exists(lighter_path):
            os.makedirs(lighter_path)

        if self.paired_end:
            input_data = "{} {}".format(os.path.abspath(self.p1),os.path.abspath(self.p2),self.out)
        else:
            input_data = "{}".format(os.path.abspath(self.p1))

        command = "{path}/external_tools/KmerStream-master/KmerStream --tsv -k 20 -t {cpu} {input_data} -o {lighter}/kmer  2>> {out}/vrap.log".format(path=self.path,lighter=lighter_path,cpu=self.cpu,out=self.out,input_data=input_data)

        os.system(command)
        size = 30000

        try:
            kmer = "{lighter}/kmer".format(lighter=lighter_path)
            if os.path.exists(kmer):
                with open(kmer) as f:
                    head = next(f)
                    value = next(f).split()

                value = [int(i) for i in value] #list comprehension
                size = value[2]-value[3]
        except Exception as e:
            vrap.log(e,True)

        self.log("k-mer size: {}".format(size))

        return size

    def lighter(self):
        """ Run ligther for read correction """
        self.log("START lighter")

        if self.paired_end:
            input_data = "-r {} -r {}".format(self.p1,self.p2)
        else:
            input_data = "-r {}".format(self.p1)

        size = self.genome_size()
        if size<30000:
            size = 30000
        #MODIFIED, since lighter in external_tool doesn't work.
        #{path}/external_tools/Lighter-master/lighter
        command = "/home/jhuang/anaconda3/bin/lighter {input} -K 20 {size} -od {out}/lighter -t {cpu} 2>> {out}/vrap.log".format(path=self.path,input=input_data,out=self.out,cpu=self.cpu,size=size)
        print(command)
        os.system(command)

        if self.paired_end:
            b1 = os.path.basename(self.p1).split(".")[0]
            self.cor1 = glob.glob("{}/lighter/{}*cor*".format(self.out,b1))[0]
            b2 = os.path.basename(self.p2).split(".")[0]
            self.cor2 = glob.glob("{}/lighter/{}*cor*".format(self.out,b2))[0]
        else:
            b1 = os.path.basename(self.single).split(".")[0]
            self.single = glob.glob("{}/lighter/{}*cor*".format(self.out,b1))[0]

    def flash (self):
        """ Run flash for paired end reads """
        self.log("START flash")

        if self.paired_end:
            command = "{path}/external_tools/FLASH-1.2.11/flash {cor1} {cor2} -o flash -d {out}/flash -M 1000 >> {out}/vrap.log".format(path=self.path,cor1=self.cor1,cor2=self.cor2,out=self.out)
            print(command)
            os.system(command)

            self.cor1   = "{}/flash/flash.notCombined_1.fastq".format(self.out)
            self.cor2   = "{}/flash/flash.notCombined_2.fastq".format(self.out)
            self.single = "{}/flash/flash.extendedFrags.fastq".format(self.out)

    def bowtie (self):
        """ Map reads to reference sequence """
        bowtie2 = Bowtie(self.gbt2,self.out,self.cpu)

        bowtie_path = "{}/bowtie".format(self.out)
        if not os.path.exists(bowtie_path):
            os.makedirs(bowtie_path)

        if self.bt2idx==None and self.host!=None:
            self.log("START bowtie2-build")
            self.bt2idx = "{}/bowtie/host".format(self.out)
            bowtie2.build(self.host)

        if self.bt2idx != None:
            if self.paired_end:
                self.log("START bowtie2 pe")
                bowtie2.pe(self.cor1,self.cor2,self.single,self.bt2idx)
                self.cor1 = "{}/bowtie/bowtie.un.1.fastq".format(self.out)
                self.cor2 = "{}/bowtie/bowtie.un.2.fastq".format(self.out)
                self.single = "{}/bowtie/bowtie.un.fastq".format(self.out)
            else:
                self.log("START bowtie2 se")
                bowtie2.se(self.single,self.bt2idx)
                self.single = "{}/bowtie/bowtie.un.fastq".format(self.out)

    def update_database (self):
        """ Update viral DB"""
        #self.log("START update viral DB")
        #EXPLANATION: maybe it needs to DELETE the directory viral_db.
        #typs = ["nucleotide"]
        #typs = ["protein"]
        typs = ["nucleotide","protein"]
        #path = os.path.dirname(os.path.realpath(__file__))
        db_path = "{}/database/viral_db/".format(self.path)

        if not os.path.exists(db_path):
                os.makedirs(db_path)

        for typ in typs:
            self.log("START update viral {} DB".format(typ))
            database = Database(typ,db_path,self.cpu)

            database.download_id()
            database.start_download()

            blast = Blast(self.cpu, self.out)
            db_type = "nucl"
            if "protein" in typ:
                db_type = "prot"

            self.log("\nSTART {} makeblastdb".format(typ))
            blast.makeblastdb(db_path+typ+".fa", "{}viral_{}".format(db_path,typ), db_type)
            #print(db_path+typ+".fa", "{}viral_{}".format(db_path,typ), db_type)
            #/home/jhuang/Tools/vrap/database/viral_db/nucleotide.fa /home/jhuang/Tools/vrap/database/viral_db/viral_nucleotide nucl
            #MANUAL_RUNNING
            #V5 didn't suitable
            #/home/jhuang/DATA/RNAHiSwitch/ncbi-blast-2.11.0+/bin/makeblastdb -in nuleotide.fa -dbtype nucl
            #/home/jhuang/DATA/RNAHiSwitch/ncbi-blast-2.11.0+/bin/makeblastdb -in protein.fa -dbtype prot
            #USING_V4
            #https://www.biostars.org/p/200742/
            #vrap/external_tools/blast/makeblastdb -in nucleotide.fa -dbtype nucl -parse_seqids -out virus_nucleotide  #-taxid_map nucl_gb.accession2taxid 
            #vrap/external_tools/blast/makeblastdb -in protein.fa -dbtype prot -parse_seqids -out virus_protein     #-taxid_map nucl_gb.accession2taxid
            #search_term="txid2697049[Organism:exp] NOT txid2[Organism:exp] NOT txid2759[Organism:exp] NOT txid2157[Organism:exp] {}".format(date)
            #virus_search_term="txid10239[Organism:exp] NOT txid2[Organism:exp] NOT txid2759[Organism:exp] NOT txid2157[Organism:exp] {}".format(date)
            #txid2=Bacteria, txid2759=Eukaryota, txid2157=Archaea, txid10239=Viruses, txid2697049=Severe acute respiratory syndrome coronavirus 2

    def spades (self):
        """ Run spades """
        self.log("START spades")
        paired = ""

        if self.paired_end:
            paired  = "-1 {} -2 {}".format(self.cor1,self.cor2)

        k       = "-k 33,55,77,99,127"
        options = "--cov-cutoff off --only-assembler --careful"
        #options = "--cov-cutoff off --only-assembler"

        single  = self.single
        if len(self.single)>0:
            single  = "--s1 {}".format(self.single)

        cpu     = "-t {}".format(self.cpu)
        output  = "-o {}/spades".format(self.out)
        os.system("{path}/external_tools/SPAdes-3.11.1-Linux/bin/spades.py {paired} {single} {k} {options} {cpu} {output} >> {out}/vrap.log".format(path=self.path, paired=paired, single=single, k=k, options=options, cpu=cpu, output=output,out=self.out))

        #for assembly only
        self.contigs = "{}/spades/contigs.fasta".format(self.out)
        self.exclude = set()
        self.scaffold = {}

    def cap3 (self):
        """ Runs cap3 """
        self.log("START cap3")
        self.contigs = "{}/spades/contigs.fasta".format(self.out)
        cap3_out = "{}/cap3".format(self.out)
        #print(cap3_out)
        if not os.path.exists(cap3_out):
            os.makedirs(cap3_out)

        os.system("{}/external_tools/cap3 '{}' -y 100 >{}/cap3/cap3.out".format(self.path,self.contigs,self.out))

        b = False
        i = 0
        consensi = {}
        consensus = ""
        group = {}
        name = ""
        exclude = set()

        with open("{}/cap3/cap3.out".format(self.out)) as f:
            for line in f:

                if not b:

                    if "*******************" in line:
                        name = line.split("* ")[1].split(" *")[0]
                        #print(name)
                        group[name] = set()

                    if "NODE" in line:
                        ary = line.split()
                        for n in ary:
                            if "NODE" in n:
                                tmp = n.replace("+","").replace("-","")
                                group[name].add(tmp)
                                exclude.add(tmp)

                if "DETAILED" in line:
                    b = True

                if b:

                    if "*******************" in line:
                        name = line.split("* ")[1].split(" *")[0]
                        #print(name)

                    ary = line.split()
                    #print(ary)
                    if len(ary) > 0:

                        if "consensus" in line:
                            seq = ary[1]
                            tmp = consensi.get(name,"")
                            tmp+=seq.replace("-","N")
                            consensi[name] = tmp

        self.scaffold = consensi
        self.exclude = exclude

    def clean_output (self):
        """ Clean spades output and include scaffolds """

        fasta_sequences = SeqIO.parse(open(self.contigs),'fasta')
        self.result = "{}/vrap_contig.fasta".format(self.out)
        output = open(self.result,"w")

        for fasta in fasta_sequences:
            if fasta.name not in self.exclude and len(fasta.seq)>=self.contig_length:
                output.write(">{}\n{}\n".format(fasta.name,fasta.seq))
        i = 1
        for name,seq in self.scaffold.items():
            if len(seq)>=self.contig_length:
                output.write(">CAP_{}_length_{}\n{}\n".format(i,len(seq),seq))
                i+=1
        output.close()

    def calculate_orf_density (self):
        """ calculate orf density """
        self.log("START calculating orf density")
        self.result = "{}/vrap_contig.fasta".format(self.out)
        orf = ORF(self.result,self.out)
        self.orf_density = orf.orf_density()

    def label_as_virus (self):
        """ Add virus tag to name """
        out_name = "{}/blast/custom_viral_seq.fa".format(self.out)
        output = open(out_name,"w")

        with open(self.virus, "r") as handle:
            for record in SeqIO.parse(handle, "fasta"):
                output.write(">{}\n{}\n".format(record.name+" [virus_user_db]",record.seq))

        output.close()

        return out_name

    def blast (self):
        """ blast """
        self.log("START blast")
        self.result = "{}/vrap_contig.fasta".format(self.out)

        blast_path = "{}/blast".format(self.out)
        blast = Blast(self.cpu, self.out)

        blast.read_fasta(self.result)

        #EXPLANATION: all virus and host-databases are saved in nt_dbs
        #             blast performed only after assembly(self.result=vrap_contig.fasta) against host+virus --> 
        #             since the host-reads are not deleted before the assembly, 
        #             vrap_contigs.fasta should contain contigs of host and virus
        #sort blast/blastn.csv and blast/blastn.fa
        nt_dbs = "{}/database/viral_db{}/viral_nucleotide".format(self.path,old)
        e_val = "1e-4"
        max_target = 1

        if not os.path.exists(blast_path):
            os.makedirs(blast_path)

        if not os.path.exists(blast_path+"/db"):
            os.makedirs(blast_path+"/db")

        #EXPLANATION
        #if the option --virus is set, make the given fasta blast-format and add it to vrap_out/blast/db/virus 
        #NO record added! nt_dbs should keep only 1 record!
        if self.virus != None:
            custom_viral_seq = self.label_as_virus()
            blast.makeblastdb(custom_viral_seq,"{}/blast/db/virus".format(self.out),"nucl")
            nt_dbs += " {}/blast/db/virus".format(self.out)
        #MODIFIED, add the self.host causing error. COMMENTED them!
        #EXPLANATION: if the option --host is set, make the given fasta blast-format and add it to vrap_out/blast/db/host
        #There are in total 3 nt_dbs, 1 prot_db, +[nt][nr], and 1 bowtie database:
        # nt_dbs /home/jhuang/Tools/vrap/database/viral_db/viral_nucleotide, ./blast/db/virus, ./blast/db/host
        # prot_db /home/jhuang/Tools/vrap/database/viral_db/viral_protein
        # bowtie_database: ./bowtie/host for substraction host-reads
        #DEBUG: /home/jhuang/Tools/vrap/external_tools/blast/blastn -num_threads 4 -query /home/jhuang/DATA/Data_Becher_Picornavirus_BovHepV/F21_1400_Liquor_vrap_out/vrap_contig.fasta -db "/home/jhuang/Tools/vrap/database/viral_db/viral_nucleotide /home/jhuang/DATA/Data_Becher_Picornavirus_BovHepV/F21_1400_Liquor_vrap_out/blast/db/host"  -evalue 1e-4 -outfmt "6 qseqid qstart qend sstart send evalue length pident sseqid stitle qcovs qcovhsp sacc slen qlen" -max_target_seqs 1 -out /home/jhuang/DATA/Data_Becher_Picornavirus_BovHepV/F21_1400_Liquor_vrap_out/blast/blastn.csv
        #--> it works!
        #Warning: [blastn] Examining 5 or more matches is recommended
        #grep -v "scrofa" blastn.csv > blastn_.csv;
        #DEBUG: BLAST Database creation error: Error: Duplicate seq_ids are found: GB|MF170920.1
        #if self.host != None:
        #   blast.makeblastdb(self.host,"{}/blast/db/host".format(self.out),"nucl")
        #   nt_dbs += " {}/blast/db/host".format(self.out)

        #EXPLANATION: blastn.csv is empty means blastn finds nonthing!
        ##nucleotide db search
        blast_out = "{}/blast/blastn.csv".format(self.out)
        blast.blastn(self.result, nt_dbs, e_val, max_target, blast_out)
        no_match_fa = "{}/blast/blastn.fa".format(self.out)
        no_results_found = blast.no_match(blast_out,no_match_fa)

        #EXPLANATION: blastx -num_threads 14 -query /home/jhuang/DATA/Data_Becher_Picornavirus_BovHepV/F21_1399_1400_Serum_vrap_out/blast/blastn.fa -db /home/jhuang/Tools/vrap/database/viral_db/viral_protein -evalue 1e-6 -outfmt 6 qseqid qstart qend sstart send evalue le+ 
        ##protein db search
        if no_results_found > 0:
            e_val = "1e-6"
            blast_out = "{}/blast/blastx.csv".format(self.out)
            prot_db = "{}/database/viral_db{}/viral_protein".format(self.path,old)
            blast.blastx(no_match_fa, prot_db, e_val, max_target, blast_out)
            no_match_fa = "{}/blast/blastx.fa".format(self.out)
            no_results_found = blast.no_match(blast_out,no_match_fa)

        #EXPLANATION: since self.nt and self.nr are not given, the following code won't run!
        if self.nt != None and no_results_found > 0:
            e_val = "1e-4"
            blast_out = "{}/blast/blastn_nt.csv".format(self.out)
            blast.blastn(no_match_fa, self.nt, e_val, max_target, blast_out)
            no_match_fa = "{}/blast/blastn_nt.fa".format(self.out)
            no_results_found = blast.no_match(blast_out,no_match_fa)

        if self.nr != None and no_results_found > 0:
            e_val = "1e-6"
            blast_out = "{}/blast/blastx_nr.csv".format(self.out)
            blast.blastx(no_match_fa, self.nr, e_val, max_target, blast_out)
            no_match_fa = "{}/blast/blastx_nr.fa".format(self.out)
            no_results_found = blast.no_match(blast_out,no_match_fa)

    def hmmer (self):
        """ hidden markov modell search """
        self.log("START HMMER")
        hmmer = HMMER(self.cpu, self.out)
        hmmer.run_hmmer()

    def summary (self):
        """ summarize output """

        if len(self.orf_density)>0:
            blast = Blast(self.cpu, self.out)
            self.blast_results = blast.summary(self.out)

            hmmer = HMMER(self.cpu, self.out)
            hmmer_output = hmmer.read_hmmer_output()

        with open("{}/vrap_summary{}.csv".format(self.out,old),"w") as f:
            f.write("name;qleng;orf_d;hmmer_eval;hmm_model;virus?;ident;qcov;tcov;tlength;tid;tname;mean_eval\n")
            for name,orf_d in self.orf_density.items():
                results = {"qlength":0,"tid":"-","tname":"-","tlength":0,"qcovarage":0,"ident":0,"tcovarage":0,"eval":1}

                if name in self.blast_results:
                    results = self.blast_results[name]

                hmmer = 1
                hmm_model = ""

                if name in hmmer_output:
                    hmmer,hmm_model = max(hmmer_output[name],key=lambda x:x[0])

                qleng = int(name.split("_")[3])
                virus = "host"

                if "virus" in results["tname"] or "phage" in results["tname"]:
                    virus = "virus"
                else:
                    if (len(results["tname"])<2):
                        if ((orf_d>0.9 and qleng <=1000) or (orf_d>0.7 and qleng >1000)):
                            virus = "potential virus"
                        else:
                            virus = "-"

                    if hmmer<=1e-5:
                        virus = "virus"

                tmp ="{0};{1:d};{2:.2f};{3:.2e};{4};{5};{6:.2f};{7:.2f};{8:.2f};{9:d};{10};{11};{12:.2e}\n".format(name,qleng,orf_d,hmmer,hmm_model,virus,results["ident"],results["qcovarage"],results["tcovarage"],results["tlength"],results["tid"],results["tname"].replace(";",","),results["eval"])
                f.write(tmp)

vrap = VrAP()

try:
    if not vrap.assembly:
        vrap.lighter()
        vrap.flash()

    if not vrap.pre:
        if not vrap.assembly:
            vrap.bowtie()

        vrap.spades()

        if not vrap.assembly:
            vrap.cap3()

        vrap.clean_output()

        vrap.calculate_orf_density()

        if len(vrap.orf_density)>0:
            vrap.hmmer()
            if not vrap.noblast:
                vrap.blast()

        vrap.summary()

    vrap.log("VrAP pipeline finished.\nThank you for using VrAP!")
except Exception as e:
    vrap.log(e,True)

like unlike

点赞本文的读者

还没有人对此文章表态

本文有评论

没有评论

How to run the pipeline vrap?

本文有评论

看文章，发评论，不要沉默

最受欢迎文章

最新文章

最多评论文章

推荐相似文章