gene_x 0 like s 623 view s
Tags: processing, virus, pipeline
download viral database
cp download_db.py down_db.py.backup
sed -i -e 's/39733/1396/g' download_db.py
#NOTE the download_db.py cannot run alone, rather than via "vrap.py -u"
(damian) jhuang@hamburg:~/DATA/Data_Tam_Acinetobacter_baumannii$ ./vrap/vrap.py -u
mv vrap/database/viral_db vrap/database/viral_db_Bacillus_cereus
run vrap.py
#--host=/home/jhuang/REFs/Sus_scrofa.Sscrofa11.1.dna.toplevel.fa
vrap.py -1 ./results_ATCC19606/trimmed/A10_CraA_HQ_trimmed_P_1.fastq.gz -2 ./results_ATCC19606/trimmed/A10_CraA_HQ_trimmed_P_2.fastq.gz -o A10_vrap_out --virus=/home/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/ATCC19606.fasta -n /media/jhuang/Titisee/GAMOLA2/Blast_db/nt -a /media/jhuang/Titisee/GAMOLA2/Blast_db/nr -t 4
#vrap/vrap.py -1 220211_M03701_0272_000000000-JWYW9/p12568/BovHepV_Bulgaria_19_S3_R1_001.fastq.gz -2 220211_M03701_0272_000000000-JWYW9/p12568/BovHepV_Bulgaria_19_S3_R2_001.fastq.gz -o BovHepV_Bulgaria_19_on_Hepacivirus --host=/home/jhuang/REFs/Bos_taurus.ARS-UCD1.2.dna.toplevel.fa -t 3
#IMPORTANT, it doesn't work, since the update will always downlod the repeated sequences causing ERROR! It needs to delete it as follows!!
#In summary: --(optional) host --> bowtie2/host (bowtie_database for host-substraction) + blast/db/host (in nt_dbs, custom blastn database before downloaded_db!)
# --(optional) virus --> blast/db/virus (in nt_dbs, custom blastn database before downloaded_db!)
# --(always) defined in download_db.py (setting by modifying the file) and run default --> database/viral_db/viral_nucleotide (nt_dbs)
# --> database/viral_db/viral_protein (prot_db)
# --(optional) nt+nr
#Therefore, the results substracted host should not need the blast with host, however it it in nt_dbs!
#Under (vrap) jhuang@hamm:~/DATA/Data_Tam_Acinetobacter_baumannii$
#DEBUG: {path}/external_tools/Lighter-master/lighter-->/home/jhuang/miniconda3/envs/vrap/bin/lighter
#DEBUG: {path}/external_tools/SPAdes-3.11.1-Linux/bin/spades.py-->/home/jhuang/miniconda3/envs/vrap/bin/spades.py
#A6: --host=/home/jhuang/REFs/Sus_scrofa.Sscrofa11.1.dna.toplevel.fa --virus=/home/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/vrap/database/viral_db_Bacillus_cereus/nucleotide_Bacillus_cereus_0.01.fasta
vrap/vrap.py -1 ./results_ATCC19606/trimmed/A6_WT_HQ_trimmed_P_1.fastq.gz -2 ./results_ATCC19606/trimmed/A6_WT_HQ_trimmed_P_2.fastq.gz -o A6_vrap_out -n /mnt/h1/jhuang/blast/nt -a /mnt/h1/jhuang/blast/nr -t 55
#A10
#START checking blast db: no host and with virus --> mapping to 5 dbs: blast/db/virus, viral_nucleotide, viral_protein, nt, nr
vrap/vrap.py -1 ./results_ATCC19606/trimmed/A10_CraA_HQ_trimmed_P_1.fastq.gz -2 ./results_ATCC19606/trimmed/A10_CraA_HQ_trimmed_P_2.fastq.gz -o A10_vrap_out --virus=/home/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/vrap/database/viral_db_Bacillus_cereus/nucleotide_Bacillus_cereus_0.01.fasta -n /mnt/h1/jhuang/blast/nt -a /mnt/h1/jhuang/blast/nr -t 55
#A12
vrap/vrap.py -1 ./results_AYE/trimmed/A12_AYE_HQ_trimmed_P_1.fastq.gz -2 ./results_AYE/trimmed/A12_AYE_HQ_trimmed_P_2.fastq.gz -o A12_vrap_out -n /mnt/h1/jhuang/blast/nt -a /mnt/h1/jhuang/blast/nr -t 55
#A19
vrap/vrap.py -1 ./results_ATCC17978/trimmed/A19_17978_HQ_trimmed_P_1.fastq.gz -2 ./results_ATCC17978/trimmed/A19_17978_HQ_trimmed_P_2.fastq.gz -o A19_vrap_out -n /mnt/h1/jhuang/blast/nt -a /mnt/h1/jhuang/blast/nr -t 55
#START VrAP
#START checking blast db
#START lighter
#START estimate k-mer size
#k-mer size: 5964136
#/home/jhuang/miniconda3/envs/vrap/bin/lighter -r ./results_ATCC19606/trimmed/A6_WT_HQ_trimmed_P_1.fastq.gz -r ./results_ATCC19606/trimmed/A6_WT_HQ_trimmed_P_2.fastq.gz -K 20 5964136 -od /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A6_vrap_out/lighter -t 55 2>> /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A6_vrap_out/vrap.log
#START flash
#/home/jhuang/Tools/vrap/external_tools/FLASH-1.2.11/flash /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A6_vrap_out/lighter/A6_WT_HQ_trimmed_P_1.cor.fq.gz /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A6_vrap_out/lighter/A6_WT_HQ_trimmed_P_2.cor.fq.gz -o flash -d /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A6_vrap_out/flash -M 1000 >> /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A6_vrap_out/vrap.log
#START spades
vrap.py (code)
#!/usr/bin/env python3
from orf import ORF
from optparse import OptionParser
import os
import glob
from Bio import SeqIO
from pathlib import Path
# see blast.py
# .ncbirc --> BLASTDB=/media/jhuang/Titisee/GAMOLA2/Blast_db/ --> /mnt/Samsung_T5/Blast_db/
from blast import Blast
from hmmer import HMMER
from bowtie import Bowtie
# download_db.py only for txid2697049 [Severe acute respiratory syndrome coronavirus 2]
from download_db import Database
import glob
import logging
from math import exp
from scipy.optimize import brentq
#old = "_2010"
old = ""
class VrAP:
""" VrAP """
def __init__ (self):
""" Class initialiser """
self.version = "1.0.5"
option = self.option_parser()
self.path = os.path.dirname(os.path.realpath(__file__))
self.paired_end = True
self.p1 = option.p1
self.p2 = option.p2
self.single = ""
self.cor1 = self.p1
self.cor2 = self.p2
self.cpu = option.cpu
self.contigs = ""
self.contig_length = option.clength
self.host = option.host
self.virus = option.virus
self.nt = option.nt
self.nr = option.nr
self.bt2idx = option.bt2idx
self.update = option.update
self.gbt2 = option.gbowtie
self.pre = option.pre
self.assembly = option.assembly
self.noblast = option.noblast
if option.out!=None:
self.out = os.path.abspath(option.out)
else:
self.out = "./"
if self.update:
self.update_database()
if self.cpu==None:
self.cpu = 1
if self.p2==None:
self.single = self.p1
self.paired_end = False
self.create_output_dir()
open("{}/vrap.log".format(self.out),"w").close()
self.log("START VrAP")
with open("{}/vrap.log".format(self.out),"a") as log_opt:
for opt, value in option.__dict__.items():
log_opt.write("{}:\t{}\n".format(opt,value))
if self.pre:
self.log("START only read correction and assembly!")
self.check_input()
def check_input (self):
""" check input """
bt2idx = self.bt2idx
if self.bt2idx!=None:
bt2idx = self.bt2idx+".1.bt2"
input_files = [self.p1,self.p2,self.host,self.virus,bt2idx]
for f in input_files:
#print(f)
if f != None:
if not os.path.exists(f):
print("{} don't exists\nPlease check input files\nVrAP quit\n".format(f))
quit()
if self.p1 == None:
self.parser.print_help()
print("-1/--p1 is mandatory\nPlease check input file\nVrAP quit\n")
quit()
#EXPLANATION: in total, mapping to 4 dbs: viral_nucleotide, viral_protein, nt, nr
dbs = ["{path}/database/viral_db/viral_nucleotide".format(path=self.path),"{path}/database/viral_db/viral_protein".format(path=self.path),self.nr,self.nt]
status = []
self.log("START checking blast db")
for db in dbs:
x = os.system("{path}/external_tools/blast/blastdbcmd -db {db} -info >> {out}/vrap.log 2>> {out}/vrap.log".format(path=self.path,db=db,out=self.out))
status.append(x)
#COMMENTED
if status[0] != 0 or status[1] != 0:
self.update_database()
if self.nr != None and status[2] != 0:
print("No alias or index file found for {}".format(self.nr))
quit()
if self.nt != None and status[3] != 0:
print("No alias or index file found for {}".format(self.nt))
quit()
def log (self,message,err=False):
""" log bowtie run """
print(message)
LOG_FILENAME = "{}/vrap.log".format(self.out)
FORMAT = '\n[VrAP] %(asctime)-15s %(message)s\n'
logging.basicConfig(filename=LOG_FILENAME,format=FORMAT,level=logging.DEBUG)
if err:
logging.exception(message)
else:
logging.info(message)
#logging.exception()
def option_parser(self):
""" Read input options """
usage = "usage: %prog -1 [FILE] [options]"
parser = OptionParser(usage=usage,version="%prog {}".format(self.version))
parser.add_option("-1","--p1", dest="p1",help="first paired end file" , metavar="FILE")
parser.add_option("-2","--p2", dest="p2",help="second paired end file", metavar="FILE")
parser.add_option("-t","--threads", dest="cpu",help="number of threads to use", metavar="INT",type="int",default=1)
parser.add_option("-r","--host", dest="host",help="reference host sequences", metavar="FASTA")
parser.add_option("-v","--virus", dest="virus",help="reference viral sequences", metavar="FASTA")
parser.add_option("-n","--nt", dest="nt",help="nucleotide blast database (e.g. nt)", metavar="DATABASE")
parser.add_option("-a","--nr", dest="nr",help="protein blast database (e.g. nr)", metavar="DATABASE")
parser.add_option("-o","--outdir", dest="out",help="output directory", metavar="DIR")
parser.add_option("-b","--bt2idx", dest="bt2idx",help="bowtie2 index", metavar="FILE")
parser.add_option("-l","--clength", dest="clength",help="minimum contig length", metavar="INT",default=500)
parser.add_option("-u","--update", dest="update",help="update viral database", default=False, action="store_true")
parser.add_option("-g","--gbt2", dest="gbowtie",help="use global PATH of bowtie2", default=False, action="store_true")
parser.add_option("-p","--pre", dest="pre",help="read correction only", default=False, action="store_true")
parser.add_option("-q","--assembly", dest="assembly",help="assembly only", default=False, action="store_true")
parser.add_option("-k","--noblast", dest="noblast",help="skip blast search", default=False, action="store_true")
self.parser = parser
(options, args) = parser.parse_args()
return options
def create_output_dir(self):
""" Create output directory """
if not os.path.exists(self.out):
os.makedirs(self.out)
def EMfit2(self,F0, f1, F1, k):
epstol = 1e-8
x = F1 / F0
e = 0
e0 = 1
x0 = 0
maxit = 1000
it = 0
while abs(x - x0) / x + abs(e - e0) > epstol and it < maxit:
it = it + 1
#print x, e, abs(x-x0)/x
x0 = x
e0 = e
x1 = -100
while abs(x - x1) / x > epstol / 2:
x1 = x
x = F1 / F0 * (3 * k * (1 - exp(-x * e / (3 * k))) + 1 - exp(-x * (1 - e)))
def func(t):
return f1 / F1 - (t * exp(-x * t / (3 * k)) + (1 - t) * exp(-x * (1 - t)))
print(func(0))
print(func(1.2))
e = brentq(func, 0, 1.1)
return x, e
def genome_size (self):
""" estimate genome size """
self.log("START estimate k-mer size")
lighter_path = "{}/lighter".format(self.out)
if not os.path.exists(lighter_path):
os.makedirs(lighter_path)
if self.paired_end:
input_data = "{} {}".format(os.path.abspath(self.p1),os.path.abspath(self.p2),self.out)
else:
input_data = "{}".format(os.path.abspath(self.p1))
command = "{path}/external_tools/KmerStream-master/KmerStream --tsv -k 20 -t {cpu} {input_data} -o {lighter}/kmer 2>> {out}/vrap.log".format(path=self.path,lighter=lighter_path,cpu=self.cpu,out=self.out,input_data=input_data)
os.system(command)
size = 30000
try:
kmer = "{lighter}/kmer".format(lighter=lighter_path)
if os.path.exists(kmer):
with open(kmer) as f:
head = next(f)
value = next(f).split()
value = [int(i) for i in value] #list comprehension
size = value[2]-value[3]
except Exception as e:
vrap.log(e,True)
self.log("k-mer size: {}".format(size))
return size
def lighter(self):
""" Run ligther for read correction """
self.log("START lighter")
if self.paired_end:
input_data = "-r {} -r {}".format(self.p1,self.p2)
else:
input_data = "-r {}".format(self.p1)
size = self.genome_size()
if size<30000:
size = 30000
#MODIFIED, since lighter in external_tool doesn't work.
#{path}/external_tools/Lighter-master/lighter
command = "/home/jhuang/anaconda3/bin/lighter {input} -K 20 {size} -od {out}/lighter -t {cpu} 2>> {out}/vrap.log".format(path=self.path,input=input_data,out=self.out,cpu=self.cpu,size=size)
print(command)
os.system(command)
if self.paired_end:
b1 = os.path.basename(self.p1).split(".")[0]
self.cor1 = glob.glob("{}/lighter/{}*cor*".format(self.out,b1))[0]
b2 = os.path.basename(self.p2).split(".")[0]
self.cor2 = glob.glob("{}/lighter/{}*cor*".format(self.out,b2))[0]
else:
b1 = os.path.basename(self.single).split(".")[0]
self.single = glob.glob("{}/lighter/{}*cor*".format(self.out,b1))[0]
def flash (self):
""" Run flash for paired end reads """
self.log("START flash")
if self.paired_end:
command = "{path}/external_tools/FLASH-1.2.11/flash {cor1} {cor2} -o flash -d {out}/flash -M 1000 >> {out}/vrap.log".format(path=self.path,cor1=self.cor1,cor2=self.cor2,out=self.out)
print(command)
os.system(command)
self.cor1 = "{}/flash/flash.notCombined_1.fastq".format(self.out)
self.cor2 = "{}/flash/flash.notCombined_2.fastq".format(self.out)
self.single = "{}/flash/flash.extendedFrags.fastq".format(self.out)
def bowtie (self):
""" Map reads to reference sequence """
bowtie2 = Bowtie(self.gbt2,self.out,self.cpu)
bowtie_path = "{}/bowtie".format(self.out)
if not os.path.exists(bowtie_path):
os.makedirs(bowtie_path)
if self.bt2idx==None and self.host!=None:
self.log("START bowtie2-build")
self.bt2idx = "{}/bowtie/host".format(self.out)
bowtie2.build(self.host)
if self.bt2idx != None:
if self.paired_end:
self.log("START bowtie2 pe")
bowtie2.pe(self.cor1,self.cor2,self.single,self.bt2idx)
self.cor1 = "{}/bowtie/bowtie.un.1.fastq".format(self.out)
self.cor2 = "{}/bowtie/bowtie.un.2.fastq".format(self.out)
self.single = "{}/bowtie/bowtie.un.fastq".format(self.out)
else:
self.log("START bowtie2 se")
bowtie2.se(self.single,self.bt2idx)
self.single = "{}/bowtie/bowtie.un.fastq".format(self.out)
def update_database (self):
""" Update viral DB"""
#self.log("START update viral DB")
#EXPLANATION: maybe it needs to DELETE the directory viral_db.
#typs = ["nucleotide"]
#typs = ["protein"]
typs = ["nucleotide","protein"]
#path = os.path.dirname(os.path.realpath(__file__))
db_path = "{}/database/viral_db/".format(self.path)
if not os.path.exists(db_path):
os.makedirs(db_path)
for typ in typs:
self.log("START update viral {} DB".format(typ))
database = Database(typ,db_path,self.cpu)
database.download_id()
database.start_download()
blast = Blast(self.cpu, self.out)
db_type = "nucl"
if "protein" in typ:
db_type = "prot"
self.log("\nSTART {} makeblastdb".format(typ))
blast.makeblastdb(db_path+typ+".fa", "{}viral_{}".format(db_path,typ), db_type)
#print(db_path+typ+".fa", "{}viral_{}".format(db_path,typ), db_type)
#/home/jhuang/Tools/vrap/database/viral_db/nucleotide.fa /home/jhuang/Tools/vrap/database/viral_db/viral_nucleotide nucl
#MANUAL_RUNNING
#V5 didn't suitable
#/home/jhuang/DATA/RNAHiSwitch/ncbi-blast-2.11.0+/bin/makeblastdb -in nuleotide.fa -dbtype nucl
#/home/jhuang/DATA/RNAHiSwitch/ncbi-blast-2.11.0+/bin/makeblastdb -in protein.fa -dbtype prot
#USING_V4
#https://www.biostars.org/p/200742/
#vrap/external_tools/blast/makeblastdb -in nucleotide.fa -dbtype nucl -parse_seqids -out virus_nucleotide #-taxid_map nucl_gb.accession2taxid
#vrap/external_tools/blast/makeblastdb -in protein.fa -dbtype prot -parse_seqids -out virus_protein #-taxid_map nucl_gb.accession2taxid
#search_term="txid2697049[Organism:exp] NOT txid2[Organism:exp] NOT txid2759[Organism:exp] NOT txid2157[Organism:exp] {}".format(date)
#virus_search_term="txid10239[Organism:exp] NOT txid2[Organism:exp] NOT txid2759[Organism:exp] NOT txid2157[Organism:exp] {}".format(date)
#txid2=Bacteria, txid2759=Eukaryota, txid2157=Archaea, txid10239=Viruses, txid2697049=Severe acute respiratory syndrome coronavirus 2
def spades (self):
""" Run spades """
self.log("START spades")
paired = ""
if self.paired_end:
paired = "-1 {} -2 {}".format(self.cor1,self.cor2)
k = "-k 33,55,77,99,127"
options = "--cov-cutoff off --only-assembler --careful"
#options = "--cov-cutoff off --only-assembler"
single = self.single
if len(self.single)>0:
single = "--s1 {}".format(self.single)
cpu = "-t {}".format(self.cpu)
output = "-o {}/spades".format(self.out)
os.system("{path}/external_tools/SPAdes-3.11.1-Linux/bin/spades.py {paired} {single} {k} {options} {cpu} {output} >> {out}/vrap.log".format(path=self.path, paired=paired, single=single, k=k, options=options, cpu=cpu, output=output,out=self.out))
#for assembly only
self.contigs = "{}/spades/contigs.fasta".format(self.out)
self.exclude = set()
self.scaffold = {}
def cap3 (self):
""" Runs cap3 """
self.log("START cap3")
self.contigs = "{}/spades/contigs.fasta".format(self.out)
cap3_out = "{}/cap3".format(self.out)
#print(cap3_out)
if not os.path.exists(cap3_out):
os.makedirs(cap3_out)
os.system("{}/external_tools/cap3 '{}' -y 100 >{}/cap3/cap3.out".format(self.path,self.contigs,self.out))
b = False
i = 0
consensi = {}
consensus = ""
group = {}
name = ""
exclude = set()
with open("{}/cap3/cap3.out".format(self.out)) as f:
for line in f:
if not b:
if "*******************" in line:
name = line.split("* ")[1].split(" *")[0]
#print(name)
group[name] = set()
if "NODE" in line:
ary = line.split()
for n in ary:
if "NODE" in n:
tmp = n.replace("+","").replace("-","")
group[name].add(tmp)
exclude.add(tmp)
if "DETAILED" in line:
b = True
if b:
if "*******************" in line:
name = line.split("* ")[1].split(" *")[0]
#print(name)
ary = line.split()
#print(ary)
if len(ary) > 0:
if "consensus" in line:
seq = ary[1]
tmp = consensi.get(name,"")
tmp+=seq.replace("-","N")
consensi[name] = tmp
self.scaffold = consensi
self.exclude = exclude
def clean_output (self):
""" Clean spades output and include scaffolds """
fasta_sequences = SeqIO.parse(open(self.contigs),'fasta')
self.result = "{}/vrap_contig.fasta".format(self.out)
output = open(self.result,"w")
for fasta in fasta_sequences:
if fasta.name not in self.exclude and len(fasta.seq)>=self.contig_length:
output.write(">{}\n{}\n".format(fasta.name,fasta.seq))
i = 1
for name,seq in self.scaffold.items():
if len(seq)>=self.contig_length:
output.write(">CAP_{}_length_{}\n{}\n".format(i,len(seq),seq))
i+=1
output.close()
def calculate_orf_density (self):
""" calculate orf density """
self.log("START calculating orf density")
self.result = "{}/vrap_contig.fasta".format(self.out)
orf = ORF(self.result,self.out)
self.orf_density = orf.orf_density()
def label_as_virus (self):
""" Add virus tag to name """
out_name = "{}/blast/custom_viral_seq.fa".format(self.out)
output = open(out_name,"w")
with open(self.virus, "r") as handle:
for record in SeqIO.parse(handle, "fasta"):
output.write(">{}\n{}\n".format(record.name+" [virus_user_db]",record.seq))
output.close()
return out_name
def blast (self):
""" blast """
self.log("START blast")
self.result = "{}/vrap_contig.fasta".format(self.out)
blast_path = "{}/blast".format(self.out)
blast = Blast(self.cpu, self.out)
blast.read_fasta(self.result)
#EXPLANATION: all virus and host-databases are saved in nt_dbs
# blast performed only after assembly(self.result=vrap_contig.fasta) against host+virus -->
# since the host-reads are not deleted before the assembly,
# vrap_contigs.fasta should contain contigs of host and virus
#sort blast/blastn.csv and blast/blastn.fa
nt_dbs = "{}/database/viral_db{}/viral_nucleotide".format(self.path,old)
e_val = "1e-4"
max_target = 1
if not os.path.exists(blast_path):
os.makedirs(blast_path)
if not os.path.exists(blast_path+"/db"):
os.makedirs(blast_path+"/db")
#EXPLANATION
#if the option --virus is set, make the given fasta blast-format and add it to vrap_out/blast/db/virus
#NO record added! nt_dbs should keep only 1 record!
if self.virus != None:
custom_viral_seq = self.label_as_virus()
blast.makeblastdb(custom_viral_seq,"{}/blast/db/virus".format(self.out),"nucl")
nt_dbs += " {}/blast/db/virus".format(self.out)
#MODIFIED, add the self.host causing error. COMMENTED them!
#EXPLANATION: if the option --host is set, make the given fasta blast-format and add it to vrap_out/blast/db/host
#There are in total 3 nt_dbs, 1 prot_db, +[nt][nr], and 1 bowtie database:
# nt_dbs /home/jhuang/Tools/vrap/database/viral_db/viral_nucleotide, ./blast/db/virus, ./blast/db/host
# prot_db /home/jhuang/Tools/vrap/database/viral_db/viral_protein
# bowtie_database: ./bowtie/host for substraction host-reads
#DEBUG: /home/jhuang/Tools/vrap/external_tools/blast/blastn -num_threads 4 -query /home/jhuang/DATA/Data_Becher_Picornavirus_BovHepV/F21_1400_Liquor_vrap_out/vrap_contig.fasta -db "/home/jhuang/Tools/vrap/database/viral_db/viral_nucleotide /home/jhuang/DATA/Data_Becher_Picornavirus_BovHepV/F21_1400_Liquor_vrap_out/blast/db/host" -evalue 1e-4 -outfmt "6 qseqid qstart qend sstart send evalue length pident sseqid stitle qcovs qcovhsp sacc slen qlen" -max_target_seqs 1 -out /home/jhuang/DATA/Data_Becher_Picornavirus_BovHepV/F21_1400_Liquor_vrap_out/blast/blastn.csv
#--> it works!
#Warning: [blastn] Examining 5 or more matches is recommended
#grep -v "scrofa" blastn.csv > blastn_.csv;
#DEBUG: BLAST Database creation error: Error: Duplicate seq_ids are found: GB|MF170920.1
#if self.host != None:
# blast.makeblastdb(self.host,"{}/blast/db/host".format(self.out),"nucl")
# nt_dbs += " {}/blast/db/host".format(self.out)
#EXPLANATION: blastn.csv is empty means blastn finds nonthing!
##nucleotide db search
blast_out = "{}/blast/blastn.csv".format(self.out)
blast.blastn(self.result, nt_dbs, e_val, max_target, blast_out)
no_match_fa = "{}/blast/blastn.fa".format(self.out)
no_results_found = blast.no_match(blast_out,no_match_fa)
#EXPLANATION: blastx -num_threads 14 -query /home/jhuang/DATA/Data_Becher_Picornavirus_BovHepV/F21_1399_1400_Serum_vrap_out/blast/blastn.fa -db /home/jhuang/Tools/vrap/database/viral_db/viral_protein -evalue 1e-6 -outfmt 6 qseqid qstart qend sstart send evalue le+
##protein db search
if no_results_found > 0:
e_val = "1e-6"
blast_out = "{}/blast/blastx.csv".format(self.out)
prot_db = "{}/database/viral_db{}/viral_protein".format(self.path,old)
blast.blastx(no_match_fa, prot_db, e_val, max_target, blast_out)
no_match_fa = "{}/blast/blastx.fa".format(self.out)
no_results_found = blast.no_match(blast_out,no_match_fa)
#EXPLANATION: since self.nt and self.nr are not given, the following code won't run!
if self.nt != None and no_results_found > 0:
e_val = "1e-4"
blast_out = "{}/blast/blastn_nt.csv".format(self.out)
blast.blastn(no_match_fa, self.nt, e_val, max_target, blast_out)
no_match_fa = "{}/blast/blastn_nt.fa".format(self.out)
no_results_found = blast.no_match(blast_out,no_match_fa)
if self.nr != None and no_results_found > 0:
e_val = "1e-6"
blast_out = "{}/blast/blastx_nr.csv".format(self.out)
blast.blastx(no_match_fa, self.nr, e_val, max_target, blast_out)
no_match_fa = "{}/blast/blastx_nr.fa".format(self.out)
no_results_found = blast.no_match(blast_out,no_match_fa)
def hmmer (self):
""" hidden markov modell search """
self.log("START HMMER")
hmmer = HMMER(self.cpu, self.out)
hmmer.run_hmmer()
def summary (self):
""" summarize output """
if len(self.orf_density)>0:
blast = Blast(self.cpu, self.out)
self.blast_results = blast.summary(self.out)
hmmer = HMMER(self.cpu, self.out)
hmmer_output = hmmer.read_hmmer_output()
with open("{}/vrap_summary{}.csv".format(self.out,old),"w") as f:
f.write("name;qleng;orf_d;hmmer_eval;hmm_model;virus?;ident;qcov;tcov;tlength;tid;tname;mean_eval\n")
for name,orf_d in self.orf_density.items():
results = {"qlength":0,"tid":"-","tname":"-","tlength":0,"qcovarage":0,"ident":0,"tcovarage":0,"eval":1}
if name in self.blast_results:
results = self.blast_results[name]
hmmer = 1
hmm_model = ""
if name in hmmer_output:
hmmer,hmm_model = max(hmmer_output[name],key=lambda x:x[0])
qleng = int(name.split("_")[3])
virus = "host"
if "virus" in results["tname"] or "phage" in results["tname"]:
virus = "virus"
else:
if (len(results["tname"])<2):
if ((orf_d>0.9 and qleng <=1000) or (orf_d>0.7 and qleng >1000)):
virus = "potential virus"
else:
virus = "-"
if hmmer<=1e-5:
virus = "virus"
tmp ="{0};{1:d};{2:.2f};{3:.2e};{4};{5};{6:.2f};{7:.2f};{8:.2f};{9:d};{10};{11};{12:.2e}\n".format(name,qleng,orf_d,hmmer,hmm_model,virus,results["ident"],results["qcovarage"],results["tcovarage"],results["tlength"],results["tid"],results["tname"].replace(";",","),results["eval"])
f.write(tmp)
vrap = VrAP()
try:
if not vrap.assembly:
vrap.lighter()
vrap.flash()
if not vrap.pre:
if not vrap.assembly:
vrap.bowtie()
vrap.spades()
if not vrap.assembly:
vrap.cap3()
vrap.clean_output()
vrap.calculate_orf_density()
if len(vrap.orf_density)>0:
vrap.hmmer()
if not vrap.noblast:
vrap.blast()
vrap.summary()
vrap.log("VrAP pipeline finished.\nThank you for using VrAP!")
except Exception as e:
vrap.log(e,True)
点赞本文的读者
还没有人对此文章表态
没有评论
RNA-seq Tam on CP059040.1 (Acinetobacter baumannii strain ATCC 19606)
Variant Calling for Herpes Simplex Virus 1 from Patient Sample Using Capture Probe Sequencing
Typing of 81 S. epidermidis samples (Luise)
Co-Authorship Network Generator using scraped data from Google Scholar via SerpAPI
© 2023 XGenes.com Impressum