How to run the pipeline vrap?

gene_x 0 like s 623 view s

Tags: processing, virus, pipeline

  1. download viral database

    cp download_db.py down_db.py.backup
    sed -i -e 's/39733/1396/g' download_db.py
    #NOTE the download_db.py cannot run alone, rather than via "vrap.py -u"
    (damian) jhuang@hamburg:~/DATA/Data_Tam_Acinetobacter_baumannii$ ./vrap/vrap.py -u
    mv vrap/database/viral_db vrap/database/viral_db_Bacillus_cereus
    
  2. run vrap.py

    #--host=/home/jhuang/REFs/Sus_scrofa.Sscrofa11.1.dna.toplevel.fa 
    vrap.py -1 ./results_ATCC19606/trimmed/A10_CraA_HQ_trimmed_P_1.fastq.gz -2 ./results_ATCC19606/trimmed/A10_CraA_HQ_trimmed_P_2.fastq.gz -o A10_vrap_out --virus=/home/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/ATCC19606.fasta -n /media/jhuang/Titisee/GAMOLA2/Blast_db/nt -a /media/jhuang/Titisee/GAMOLA2/Blast_db/nr -t 4
    #vrap/vrap.py -1 220211_M03701_0272_000000000-JWYW9/p12568/BovHepV_Bulgaria_19_S3_R1_001.fastq.gz -2 220211_M03701_0272_000000000-JWYW9/p12568/BovHepV_Bulgaria_19_S3_R2_001.fastq.gz -o BovHepV_Bulgaria_19_on_Hepacivirus --host=/home/jhuang/REFs/Bos_taurus.ARS-UCD1.2.dna.toplevel.fa  -t 3
    #IMPORTANT, it doesn't work, since the update will always downlod the repeated sequences causing ERROR! It needs to delete it as follows!!
    
    #In summary: --(optional) host --> bowtie2/host (bowtie_database for host-substraction) + blast/db/host (in nt_dbs, custom blastn database before downloaded_db!)
    #            --(optional) virus --> blast/db/virus (in nt_dbs, custom blastn database before downloaded_db!)
    #            --(always) defined in download_db.py (setting by modifying the file) and run default --> database/viral_db/viral_nucleotide (nt_dbs)
    #                                                                                        --> database/viral_db/viral_protein (prot_db)
    #            --(optional) nt+nr
    #Therefore, the results substracted host should not need the blast with host, however it it in nt_dbs!
    
    #Under (vrap) jhuang@hamm:~/DATA/Data_Tam_Acinetobacter_baumannii$ 
    #DEBUG: {path}/external_tools/Lighter-master/lighter-->/home/jhuang/miniconda3/envs/vrap/bin/lighter
    #DEBUG: {path}/external_tools/SPAdes-3.11.1-Linux/bin/spades.py-->/home/jhuang/miniconda3/envs/vrap/bin/spades.py
    
    #A6: --host=/home/jhuang/REFs/Sus_scrofa.Sscrofa11.1.dna.toplevel.fa  --virus=/home/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/vrap/database/viral_db_Bacillus_cereus/nucleotide_Bacillus_cereus_0.01.fasta
    vrap/vrap.py -1 ./results_ATCC19606/trimmed/A6_WT_HQ_trimmed_P_1.fastq.gz -2 ./results_ATCC19606/trimmed/A6_WT_HQ_trimmed_P_2.fastq.gz -o A6_vrap_out  -n /mnt/h1/jhuang/blast/nt -a /mnt/h1/jhuang/blast/nr -t 55
    
    #A10 
    #START checking blast db: no host and with virus --> mapping to 5 dbs: blast/db/virus, viral_nucleotide, viral_protein, nt, nr
    vrap/vrap.py -1 ./results_ATCC19606/trimmed/A10_CraA_HQ_trimmed_P_1.fastq.gz -2 ./results_ATCC19606/trimmed/A10_CraA_HQ_trimmed_P_2.fastq.gz -o A10_vrap_out --virus=/home/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/vrap/database/viral_db_Bacillus_cereus/nucleotide_Bacillus_cereus_0.01.fasta   -n /mnt/h1/jhuang/blast/nt -a /mnt/h1/jhuang/blast/nr -t 55
    
    #A12
    vrap/vrap.py -1 ./results_AYE/trimmed/A12_AYE_HQ_trimmed_P_1.fastq.gz -2 ./results_AYE/trimmed/A12_AYE_HQ_trimmed_P_2.fastq.gz -o A12_vrap_out  -n /mnt/h1/jhuang/blast/nt -a /mnt/h1/jhuang/blast/nr -t 55
    
    #A19 
    vrap/vrap.py -1 ./results_ATCC17978/trimmed/A19_17978_HQ_trimmed_P_1.fastq.gz -2 ./results_ATCC17978/trimmed/A19_17978_HQ_trimmed_P_2.fastq.gz -o A19_vrap_out  -n /mnt/h1/jhuang/blast/nt -a /mnt/h1/jhuang/blast/nr -t 55
    
    #START VrAP
    #START checking blast db
    #START lighter
    #START estimate k-mer size
    #k-mer size: 5964136
    #/home/jhuang/miniconda3/envs/vrap/bin/lighter -r ./results_ATCC19606/trimmed/A6_WT_HQ_trimmed_P_1.fastq.gz -r ./results_ATCC19606/trimmed/A6_WT_HQ_trimmed_P_2.fastq.gz -K 20 5964136 -od /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A6_vrap_out/lighter -t 55 2>> /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A6_vrap_out/vrap.log
    #START flash
    #/home/jhuang/Tools/vrap/external_tools/FLASH-1.2.11/flash /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A6_vrap_out/lighter/A6_WT_HQ_trimmed_P_1.cor.fq.gz /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A6_vrap_out/lighter/A6_WT_HQ_trimmed_P_2.cor.fq.gz -o flash -d /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A6_vrap_out/flash -M 1000 >> /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A6_vrap_out/vrap.log
    #START spades
    
  3. vrap.py (code)

    #!/usr/bin/env python3
    
    from orf import ORF
    from optparse import OptionParser
    import os
    import glob
    from Bio import SeqIO
    from pathlib import Path
    # see blast.py
    # .ncbirc --> BLASTDB=/media/jhuang/Titisee/GAMOLA2/Blast_db/ --> /mnt/Samsung_T5/Blast_db/
    from blast import Blast
    from hmmer import HMMER
    from bowtie import Bowtie
    # download_db.py only for txid2697049 [Severe acute respiratory syndrome coronavirus 2]
    from download_db import Database
    import glob
    import logging
    from math import exp
    from scipy.optimize import brentq
    
    #old = "_2010"
    old = ""
    
    class VrAP:
        """ VrAP """
    
        def __init__ (self):
            """ Class initialiser """
            self.version = "1.0.5"
            option = self.option_parser()
            self.path = os.path.dirname(os.path.realpath(__file__))
    
            self.paired_end = True
            self.p1  = option.p1
            self.p2  = option.p2
            self.single = ""
            self.cor1 = self.p1
            self.cor2 = self.p2
            self.cpu = option.cpu
            self.contigs = ""
            self.contig_length = option.clength
            self.host = option.host
            self.virus = option.virus
            self.nt = option.nt
            self.nr = option.nr
            self.bt2idx = option.bt2idx
            self.update = option.update
            self.gbt2 = option.gbowtie
            self.pre = option.pre
            self.assembly = option.assembly
            self.noblast = option.noblast
    
            if option.out!=None:
                self.out = os.path.abspath(option.out)
            else:
                self.out = "./"
    
            if self.update:
                self.update_database()
    
            if self.cpu==None:
                self.cpu = 1
    
            if self.p2==None:
                self.single = self.p1
                self.paired_end = False
    
            self.create_output_dir()
    
            open("{}/vrap.log".format(self.out),"w").close()
            self.log("START VrAP")
    
            with open("{}/vrap.log".format(self.out),"a") as log_opt:
                for opt, value in option.__dict__.items():
                    log_opt.write("{}:\t{}\n".format(opt,value))
    
            if self.pre:
                self.log("START only read correction and assembly!")
    
            self.check_input()
    
        def check_input (self):
            """ check input """
    
            bt2idx = self.bt2idx
            if self.bt2idx!=None:
                bt2idx = self.bt2idx+".1.bt2"
    
            input_files = [self.p1,self.p2,self.host,self.virus,bt2idx]
    
            for f in input_files:
                #print(f)
                if f != None:
                    if not os.path.exists(f):
    
                        print("{} don't exists\nPlease check input files\nVrAP quit\n".format(f))
                        quit()
    
            if self.p1 == None:
                self.parser.print_help()
                print("-1/--p1 is mandatory\nPlease check input file\nVrAP quit\n")
                quit()
    
            #EXPLANATION: in total, mapping to 4 dbs: viral_nucleotide, viral_protein, nt, nr
            dbs = ["{path}/database/viral_db/viral_nucleotide".format(path=self.path),"{path}/database/viral_db/viral_protein".format(path=self.path),self.nr,self.nt]
            status = []
            self.log("START checking blast db")
    
            for db in dbs:
                x = os.system("{path}/external_tools/blast/blastdbcmd -db {db} -info >> {out}/vrap.log 2>> {out}/vrap.log".format(path=self.path,db=db,out=self.out))
                status.append(x)
    
            #COMMENTED
            if status[0] != 0 or status[1] != 0:
                self.update_database()
    
            if self.nr != None and status[2] != 0:
                print("No alias or index file found for {}".format(self.nr))
                quit()
    
            if self.nt != None and status[3] != 0:
                print("No alias or index file found for {}".format(self.nt))
                quit()
    
        def log (self,message,err=False):
            """ log bowtie run """
            print(message)
            LOG_FILENAME = "{}/vrap.log".format(self.out)
            FORMAT = '\n[VrAP] %(asctime)-15s %(message)s\n'
    
            logging.basicConfig(filename=LOG_FILENAME,format=FORMAT,level=logging.DEBUG)
            if err:
                logging.exception(message)
            else:
                logging.info(message)
            #logging.exception()
    
        def option_parser(self):
            """ Read input options """
            usage = "usage: %prog -1 [FILE] [options]"
            parser = OptionParser(usage=usage,version="%prog {}".format(self.version))
    
            parser.add_option("-1","--p1", dest="p1",help="first paired end file" , metavar="FILE")
            parser.add_option("-2","--p2", dest="p2",help="second paired end file", metavar="FILE")
            parser.add_option("-t","--threads", dest="cpu",help="number of threads to use", metavar="INT",type="int",default=1)
            parser.add_option("-r","--host", dest="host",help="reference host sequences", metavar="FASTA")
            parser.add_option("-v","--virus", dest="virus",help="reference viral sequences", metavar="FASTA")
            parser.add_option("-n","--nt", dest="nt",help="nucleotide blast database (e.g. nt)", metavar="DATABASE")
            parser.add_option("-a","--nr", dest="nr",help="protein blast database (e.g. nr)", metavar="DATABASE")
            parser.add_option("-o","--outdir", dest="out",help="output directory", metavar="DIR")
            parser.add_option("-b","--bt2idx", dest="bt2idx",help="bowtie2 index", metavar="FILE")
            parser.add_option("-l","--clength", dest="clength",help="minimum contig length", metavar="INT",default=500)
            parser.add_option("-u","--update", dest="update",help="update viral database",  default=False, action="store_true")
            parser.add_option("-g","--gbt2", dest="gbowtie",help="use global PATH of bowtie2",  default=False, action="store_true")
            parser.add_option("-p","--pre", dest="pre",help="read correction only",  default=False, action="store_true")
            parser.add_option("-q","--assembly", dest="assembly",help="assembly only",  default=False, action="store_true")
            parser.add_option("-k","--noblast", dest="noblast",help="skip blast search",  default=False, action="store_true")
            self.parser = parser
            (options, args) = parser.parse_args()
    
            return options
    
        def create_output_dir(self):
            """ Create output directory """
            if not os.path.exists(self.out):
                os.makedirs(self.out)
    
        def EMfit2(self,F0, f1, F1, k):
            epstol = 1e-8
            x = F1 / F0
            e = 0
            e0 = 1
            x0 = 0
            maxit = 1000
            it = 0
            while abs(x - x0) / x + abs(e - e0) > epstol and it < maxit:
                it = it + 1
                #print x, e, abs(x-x0)/x
                x0 = x
                e0 = e
                x1 = -100
                while abs(x - x1) / x > epstol / 2:
                    x1 = x
                    x  = F1 / F0 * (3 * k * (1 - exp(-x * e / (3 * k))) + 1 - exp(-x * (1 - e)))
                def func(t):
                    return f1 / F1 - (t * exp(-x * t / (3 * k)) + (1 - t) * exp(-x * (1 - t)))
                print(func(0))
                print(func(1.2))
                e = brentq(func, 0, 1.1)
            return x, e
    
        def genome_size (self):
            """ estimate genome size """
            self.log("START estimate k-mer size")
    
            lighter_path = "{}/lighter".format(self.out)
            if not os.path.exists(lighter_path):
                os.makedirs(lighter_path)
    
            if self.paired_end:
                input_data = "{} {}".format(os.path.abspath(self.p1),os.path.abspath(self.p2),self.out)
            else:
                input_data = "{}".format(os.path.abspath(self.p1))
    
            command = "{path}/external_tools/KmerStream-master/KmerStream --tsv -k 20 -t {cpu} {input_data} -o {lighter}/kmer  2>> {out}/vrap.log".format(path=self.path,lighter=lighter_path,cpu=self.cpu,out=self.out,input_data=input_data)
    
            os.system(command)
            size = 30000
    
            try:
                kmer = "{lighter}/kmer".format(lighter=lighter_path)
                if os.path.exists(kmer):
                    with open(kmer) as f:
                        head = next(f)
                        value = next(f).split()
    
                    value = [int(i) for i in value] #list comprehension
                    size = value[2]-value[3]
            except Exception as e:
                vrap.log(e,True)
    
            self.log("k-mer size: {}".format(size))
    
            return size
    
        def lighter(self):
            """ Run ligther for read correction """
            self.log("START lighter")
    
            if self.paired_end:
                input_data = "-r {} -r {}".format(self.p1,self.p2)
            else:
                input_data = "-r {}".format(self.p1)
    
            size = self.genome_size()
            if size<30000:
                size = 30000
            #MODIFIED, since lighter in external_tool doesn't work.
            #{path}/external_tools/Lighter-master/lighter
            command = "/home/jhuang/anaconda3/bin/lighter {input} -K 20 {size} -od {out}/lighter -t {cpu} 2>> {out}/vrap.log".format(path=self.path,input=input_data,out=self.out,cpu=self.cpu,size=size)
            print(command)
            os.system(command)
    
            if self.paired_end:
                b1 = os.path.basename(self.p1).split(".")[0]
                self.cor1 = glob.glob("{}/lighter/{}*cor*".format(self.out,b1))[0]
                b2 = os.path.basename(self.p2).split(".")[0]
                self.cor2 = glob.glob("{}/lighter/{}*cor*".format(self.out,b2))[0]
            else:
                b1 = os.path.basename(self.single).split(".")[0]
                self.single = glob.glob("{}/lighter/{}*cor*".format(self.out,b1))[0]
    
        def flash (self):
            """ Run flash for paired end reads """
            self.log("START flash")
    
            if self.paired_end:
                command = "{path}/external_tools/FLASH-1.2.11/flash {cor1} {cor2} -o flash -d {out}/flash -M 1000 >> {out}/vrap.log".format(path=self.path,cor1=self.cor1,cor2=self.cor2,out=self.out)
                print(command)
                os.system(command)
    
                self.cor1   = "{}/flash/flash.notCombined_1.fastq".format(self.out)
                self.cor2   = "{}/flash/flash.notCombined_2.fastq".format(self.out)
                self.single = "{}/flash/flash.extendedFrags.fastq".format(self.out)
    
        def bowtie (self):
            """ Map reads to reference sequence """
            bowtie2 = Bowtie(self.gbt2,self.out,self.cpu)
    
            bowtie_path = "{}/bowtie".format(self.out)
            if not os.path.exists(bowtie_path):
                os.makedirs(bowtie_path)
    
            if self.bt2idx==None and self.host!=None:
                self.log("START bowtie2-build")
                self.bt2idx = "{}/bowtie/host".format(self.out)
                bowtie2.build(self.host)
    
            if self.bt2idx != None:
                if self.paired_end:
                    self.log("START bowtie2 pe")
                    bowtie2.pe(self.cor1,self.cor2,self.single,self.bt2idx)
                    self.cor1 = "{}/bowtie/bowtie.un.1.fastq".format(self.out)
                    self.cor2 = "{}/bowtie/bowtie.un.2.fastq".format(self.out)
                    self.single = "{}/bowtie/bowtie.un.fastq".format(self.out)
                else:
                    self.log("START bowtie2 se")
                    bowtie2.se(self.single,self.bt2idx)
                    self.single = "{}/bowtie/bowtie.un.fastq".format(self.out)
    
        def update_database (self):
            """ Update viral DB"""
            #self.log("START update viral DB")
            #EXPLANATION: maybe it needs to DELETE the directory viral_db.
            #typs = ["nucleotide"]
            #typs = ["protein"]
            typs = ["nucleotide","protein"]
            #path = os.path.dirname(os.path.realpath(__file__))
            db_path = "{}/database/viral_db/".format(self.path)
    
            if not os.path.exists(db_path):
                    os.makedirs(db_path)
    
            for typ in typs:
                self.log("START update viral {} DB".format(typ))
                database = Database(typ,db_path,self.cpu)
    
                database.download_id()
                database.start_download()
    
                blast = Blast(self.cpu, self.out)
                db_type = "nucl"
                if "protein" in typ:
                    db_type = "prot"
    
                self.log("\nSTART {} makeblastdb".format(typ))
                blast.makeblastdb(db_path+typ+".fa", "{}viral_{}".format(db_path,typ), db_type)
                #print(db_path+typ+".fa", "{}viral_{}".format(db_path,typ), db_type)
                #/home/jhuang/Tools/vrap/database/viral_db/nucleotide.fa /home/jhuang/Tools/vrap/database/viral_db/viral_nucleotide nucl
                #MANUAL_RUNNING
                #V5 didn't suitable
                #/home/jhuang/DATA/RNAHiSwitch/ncbi-blast-2.11.0+/bin/makeblastdb -in nuleotide.fa -dbtype nucl
                #/home/jhuang/DATA/RNAHiSwitch/ncbi-blast-2.11.0+/bin/makeblastdb -in protein.fa -dbtype prot
                #USING_V4
                #https://www.biostars.org/p/200742/
                #vrap/external_tools/blast/makeblastdb -in nucleotide.fa -dbtype nucl -parse_seqids -out virus_nucleotide  #-taxid_map nucl_gb.accession2taxid 
                #vrap/external_tools/blast/makeblastdb -in protein.fa -dbtype prot -parse_seqids -out virus_protein     #-taxid_map nucl_gb.accession2taxid
                #search_term="txid2697049[Organism:exp] NOT txid2[Organism:exp] NOT txid2759[Organism:exp] NOT txid2157[Organism:exp] {}".format(date)
                #virus_search_term="txid10239[Organism:exp] NOT txid2[Organism:exp] NOT txid2759[Organism:exp] NOT txid2157[Organism:exp] {}".format(date)
                #txid2=Bacteria, txid2759=Eukaryota, txid2157=Archaea, txid10239=Viruses, txid2697049=Severe acute respiratory syndrome coronavirus 2
    
        def spades (self):
            """ Run spades """
            self.log("START spades")
            paired = ""
    
            if self.paired_end:
                paired  = "-1 {} -2 {}".format(self.cor1,self.cor2)
    
            k       = "-k 33,55,77,99,127"
            options = "--cov-cutoff off --only-assembler --careful"
            #options = "--cov-cutoff off --only-assembler"
    
            single  = self.single
            if len(self.single)>0:
                single  = "--s1 {}".format(self.single)
    
            cpu     = "-t {}".format(self.cpu)
            output  = "-o {}/spades".format(self.out)
            os.system("{path}/external_tools/SPAdes-3.11.1-Linux/bin/spades.py {paired} {single} {k} {options} {cpu} {output} >> {out}/vrap.log".format(path=self.path, paired=paired, single=single, k=k, options=options, cpu=cpu, output=output,out=self.out))
    
            #for assembly only
            self.contigs = "{}/spades/contigs.fasta".format(self.out)
            self.exclude = set()
            self.scaffold = {}
    
        def cap3 (self):
            """ Runs cap3 """
            self.log("START cap3")
            self.contigs = "{}/spades/contigs.fasta".format(self.out)
            cap3_out = "{}/cap3".format(self.out)
            #print(cap3_out)
            if not os.path.exists(cap3_out):
                os.makedirs(cap3_out)
    
            os.system("{}/external_tools/cap3 '{}' -y 100 >{}/cap3/cap3.out".format(self.path,self.contigs,self.out))
    
            b = False
            i = 0
            consensi = {}
            consensus = ""
            group = {}
            name = ""
            exclude = set()
    
            with open("{}/cap3/cap3.out".format(self.out)) as f:
                for line in f:
    
                    if not b:
    
                        if "*******************" in line:
                            name = line.split("* ")[1].split(" *")[0]
                            #print(name)
                            group[name] = set()
    
                        if "NODE" in line:
                            ary = line.split()
                            for n in ary:
                                if "NODE" in n:
                                    tmp = n.replace("+","").replace("-","")
                                    group[name].add(tmp)
                                    exclude.add(tmp)
    
                    if "DETAILED" in line:
                        b = True
    
                    if b:
    
                        if "*******************" in line:
                            name = line.split("* ")[1].split(" *")[0]
                            #print(name)
    
                        ary = line.split()
                        #print(ary)
                        if len(ary) > 0:
    
                            if "consensus" in line:
                                seq = ary[1]
                                tmp = consensi.get(name,"")
                                tmp+=seq.replace("-","N")
                                consensi[name] = tmp
    
            self.scaffold = consensi
            self.exclude = exclude
    
        def clean_output (self):
            """ Clean spades output and include scaffolds """
    
            fasta_sequences = SeqIO.parse(open(self.contigs),'fasta')
            self.result = "{}/vrap_contig.fasta".format(self.out)
            output = open(self.result,"w")
    
            for fasta in fasta_sequences:
                if fasta.name not in self.exclude and len(fasta.seq)>=self.contig_length:
                    output.write(">{}\n{}\n".format(fasta.name,fasta.seq))
            i = 1
            for name,seq in self.scaffold.items():
                if len(seq)>=self.contig_length:
                    output.write(">CAP_{}_length_{}\n{}\n".format(i,len(seq),seq))
                    i+=1
            output.close()
    
        def calculate_orf_density (self):
            """ calculate orf density """
            self.log("START calculating orf density")
            self.result = "{}/vrap_contig.fasta".format(self.out)
            orf = ORF(self.result,self.out)
            self.orf_density = orf.orf_density()
    
        def label_as_virus (self):
            """ Add virus tag to name """
            out_name = "{}/blast/custom_viral_seq.fa".format(self.out)
            output = open(out_name,"w")
    
            with open(self.virus, "r") as handle:
                for record in SeqIO.parse(handle, "fasta"):
                    output.write(">{}\n{}\n".format(record.name+" [virus_user_db]",record.seq))
    
            output.close()
    
            return out_name
    
        def blast (self):
            """ blast """
            self.log("START blast")
            self.result = "{}/vrap_contig.fasta".format(self.out)
    
            blast_path = "{}/blast".format(self.out)
            blast = Blast(self.cpu, self.out)
    
            blast.read_fasta(self.result)
    
            #EXPLANATION: all virus and host-databases are saved in nt_dbs
            #             blast performed only after assembly(self.result=vrap_contig.fasta) against host+virus --> 
            #             since the host-reads are not deleted before the assembly, 
            #             vrap_contigs.fasta should contain contigs of host and virus
            #sort blast/blastn.csv and blast/blastn.fa
            nt_dbs = "{}/database/viral_db{}/viral_nucleotide".format(self.path,old)
            e_val = "1e-4"
            max_target = 1
    
            if not os.path.exists(blast_path):
                os.makedirs(blast_path)
    
            if not os.path.exists(blast_path+"/db"):
                os.makedirs(blast_path+"/db")
    
            #EXPLANATION
            #if the option --virus is set, make the given fasta blast-format and add it to vrap_out/blast/db/virus 
            #NO record added! nt_dbs should keep only 1 record!
            if self.virus != None:
                custom_viral_seq = self.label_as_virus()
                blast.makeblastdb(custom_viral_seq,"{}/blast/db/virus".format(self.out),"nucl")
                nt_dbs += " {}/blast/db/virus".format(self.out)
            #MODIFIED, add the self.host causing error. COMMENTED them!
            #EXPLANATION: if the option --host is set, make the given fasta blast-format and add it to vrap_out/blast/db/host
            #There are in total 3 nt_dbs, 1 prot_db, +[nt][nr], and 1 bowtie database:
            # nt_dbs /home/jhuang/Tools/vrap/database/viral_db/viral_nucleotide, ./blast/db/virus, ./blast/db/host
            # prot_db /home/jhuang/Tools/vrap/database/viral_db/viral_protein
            # bowtie_database: ./bowtie/host for substraction host-reads
            #DEBUG: /home/jhuang/Tools/vrap/external_tools/blast/blastn -num_threads 4 -query /home/jhuang/DATA/Data_Becher_Picornavirus_BovHepV/F21_1400_Liquor_vrap_out/vrap_contig.fasta -db "/home/jhuang/Tools/vrap/database/viral_db/viral_nucleotide /home/jhuang/DATA/Data_Becher_Picornavirus_BovHepV/F21_1400_Liquor_vrap_out/blast/db/host"  -evalue 1e-4 -outfmt "6 qseqid qstart qend sstart send evalue length pident sseqid stitle qcovs qcovhsp sacc slen qlen" -max_target_seqs 1 -out /home/jhuang/DATA/Data_Becher_Picornavirus_BovHepV/F21_1400_Liquor_vrap_out/blast/blastn.csv
            #--> it works!
            #Warning: [blastn] Examining 5 or more matches is recommended
            #grep -v "scrofa" blastn.csv > blastn_.csv;
            #DEBUG: BLAST Database creation error: Error: Duplicate seq_ids are found: GB|MF170920.1
            #if self.host != None:
            #   blast.makeblastdb(self.host,"{}/blast/db/host".format(self.out),"nucl")
            #   nt_dbs += " {}/blast/db/host".format(self.out)
    
            #EXPLANATION: blastn.csv is empty means blastn finds nonthing!
            ##nucleotide db search
            blast_out = "{}/blast/blastn.csv".format(self.out)
            blast.blastn(self.result, nt_dbs, e_val, max_target, blast_out)
            no_match_fa = "{}/blast/blastn.fa".format(self.out)
            no_results_found = blast.no_match(blast_out,no_match_fa)
    
            #EXPLANATION: blastx -num_threads 14 -query /home/jhuang/DATA/Data_Becher_Picornavirus_BovHepV/F21_1399_1400_Serum_vrap_out/blast/blastn.fa -db /home/jhuang/Tools/vrap/database/viral_db/viral_protein -evalue 1e-6 -outfmt 6 qseqid qstart qend sstart send evalue le+ 
            ##protein db search
            if no_results_found > 0:
                e_val = "1e-6"
                blast_out = "{}/blast/blastx.csv".format(self.out)
                prot_db = "{}/database/viral_db{}/viral_protein".format(self.path,old)
                blast.blastx(no_match_fa, prot_db, e_val, max_target, blast_out)
                no_match_fa = "{}/blast/blastx.fa".format(self.out)
                no_results_found = blast.no_match(blast_out,no_match_fa)
    
            #EXPLANATION: since self.nt and self.nr are not given, the following code won't run!
            if self.nt != None and no_results_found > 0:
                e_val = "1e-4"
                blast_out = "{}/blast/blastn_nt.csv".format(self.out)
                blast.blastn(no_match_fa, self.nt, e_val, max_target, blast_out)
                no_match_fa = "{}/blast/blastn_nt.fa".format(self.out)
                no_results_found = blast.no_match(blast_out,no_match_fa)
    
            if self.nr != None and no_results_found > 0:
                e_val = "1e-6"
                blast_out = "{}/blast/blastx_nr.csv".format(self.out)
                blast.blastx(no_match_fa, self.nr, e_val, max_target, blast_out)
                no_match_fa = "{}/blast/blastx_nr.fa".format(self.out)
                no_results_found = blast.no_match(blast_out,no_match_fa)
    
        def hmmer (self):
            """ hidden markov modell search """
            self.log("START HMMER")
            hmmer = HMMER(self.cpu, self.out)
            hmmer.run_hmmer()
    
        def summary (self):
            """ summarize output """
    
            if len(self.orf_density)>0:
                blast = Blast(self.cpu, self.out)
                self.blast_results = blast.summary(self.out)
    
                hmmer = HMMER(self.cpu, self.out)
                hmmer_output = hmmer.read_hmmer_output()
    
            with open("{}/vrap_summary{}.csv".format(self.out,old),"w") as f:
                f.write("name;qleng;orf_d;hmmer_eval;hmm_model;virus?;ident;qcov;tcov;tlength;tid;tname;mean_eval\n")
                for name,orf_d in self.orf_density.items():
                    results = {"qlength":0,"tid":"-","tname":"-","tlength":0,"qcovarage":0,"ident":0,"tcovarage":0,"eval":1}
    
                    if name in self.blast_results:
                        results = self.blast_results[name]
    
                    hmmer = 1
                    hmm_model = ""
    
                    if name in hmmer_output:
                        hmmer,hmm_model = max(hmmer_output[name],key=lambda x:x[0])
    
                    qleng = int(name.split("_")[3])
                    virus = "host"
    
                    if "virus" in results["tname"] or "phage" in results["tname"]:
                        virus = "virus"
                    else:
                        if (len(results["tname"])<2):
                            if ((orf_d>0.9 and qleng <=1000) or (orf_d>0.7 and qleng >1000)):
                                virus = "potential virus"
                            else:
                                virus = "-"
    
                        if hmmer<=1e-5:
                            virus = "virus"
    
                    tmp ="{0};{1:d};{2:.2f};{3:.2e};{4};{5};{6:.2f};{7:.2f};{8:.2f};{9:d};{10};{11};{12:.2e}\n".format(name,qleng,orf_d,hmmer,hmm_model,virus,results["ident"],results["qcovarage"],results["tcovarage"],results["tlength"],results["tid"],results["tname"].replace(";",","),results["eval"])
                    f.write(tmp)
    
    vrap = VrAP()
    
    try:
        if not vrap.assembly:
            vrap.lighter()
            vrap.flash()
    
        if not vrap.pre:
            if not vrap.assembly:
                vrap.bowtie()
    
            vrap.spades()
    
            if not vrap.assembly:
                vrap.cap3()
    
            vrap.clean_output()
    
            vrap.calculate_orf_density()
    
            if len(vrap.orf_density)>0:
                vrap.hmmer()
                if not vrap.noblast:
                    vrap.blast()
    
            vrap.summary()
    
        vrap.log("VrAP pipeline finished.\nThank you for using VrAP!")
    except Exception as e:
        vrap.log(e,True)
    

like unlike

点赞本文的读者

还没有人对此文章表态


本文有评论

没有评论

看文章,发评论,不要沉默


© 2023 XGenes.com Impressum