gene_x 0 like s 274 view s
Tags: pipeline
1: trimming using trimmomatic
mkdir trimmed bams
for sample in hCoV229E_Rluc p10_DMSO p10_K22 p10_K7523; do \
java -jar /home/jhuang/Tools/Trimmomatic-0.36/trimmomatic-0.36.jar PE -threads 100 ./raw_data/${sample}_R1.fastq.gz ./raw_data/${sample}_R2.fastq.gz trimmed/${sample}_R1.fastq.gz trimmed/${sample}_unpaired_R1.fastq.gz trimmed/${sample}_R2.fastq.gz trimmed/${sample}_unpaired_R2.fastq.gz ILLUMINACLIP:/home/jhuang/Tools/Trimmomatic-0.36/adapters/TruSeq3-PE-2.fa:2:30:10:8:TRUE LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36 AVGQUAL:20; \
done
#(optional) #The reference alignment can be downloaded from the ViPR site:
#https://www.viprbrc.org/brc/workbenchSequenceSearch.spg?#uploadedFileId=20272&decorator=flavi&method=SubmitForm
##-- deduplicate fasta --
##awk 'BEGIN{RS=">"}NR>1{sub("\n","\t"); gsub("\n",""); print RS$0}' RSV.fa | awk '!seen[$0]++' | awk -v OFS="\n" '{for(i=2;i<NF;i++) head = head " " $i; print $1 " " head,$NF; head = ""}'
##sed -e '/^>/s/$/@/' -e 's/^>/#/' RSV.fa | tr -d '\n' | tr "#" "\n" | tr "@" "\t" | sort -u -k1,1 | sed -e 's/^/>/' -e 's/\t/\n/' > RSV_dedup.fa
#from Bio import SeqIO
#with open('RSV_dedup.fa', 'a') as outFile:
# record_ids = list()
# for record in SeqIO.parse('RSV.fa', 'fasta'):
# if record.id not in record_ids:
# record_ids.append( record.id )
# SeqIO.write(record, outFile, 'fasta')
mv PP810610.1.fasta PP810610.1.fa
ref_fa="PP810610.1.fa";
# raw mapping
for sample in hCoV229E_Rluc p10_DMSO p10_K22 p10_K7523; do \
bwa index ${ref_fa}; \
bwa mem -M -t 16 ${ref_fa} trimmed/${sample}_R1.fastq.gz trimmed/${sample}_R2.fastq.gz | samtools view -bS - > bams/${sample}_genome_alignment.bam; \
#for table filling using the following commands! -->3000000 \
#bwa mem -M -t 14 ${ref_fa} ${sample}_R1.fastq.gz ${sample}_R2.fastq.gz | samtools view -bS -F 256 - > bams/${sample}_uniqmap.bam; \
done
3: AddOrReplaceReadGroup is IMPORTANT step, otherwise the step viral_ngs cannot run correctly
#MODIFIED
#default_jvm_mem_opts="-Xms512m -Xmx1g"
#--> default_jvm_mem_opts="-Xms256g -Xmx512g"
for sample in hCoV229E_Rluc p10_DMSO p10_K22 p10_K7523; do \
picard AddOrReplaceReadGroups I=bams/${sample}_genome_alignment.bam O=data/00_raw/${sample}.bam SORT_ORDER=coordinate CREATE_INDEX=true RGPL=illumina RGID=$sample RGSM=$sample RGLB=standard RGPU=$sample VALIDATION_STRINGENCY=LENIENT; \
done
set values in samples-*.txt before running viral-ngs
ln -s ~/Tools/viral-ngs/Snakefile Snakefile
ln -s ~/Tools/viral-ngs/bin bin
cp ~/Tools/viral-ngs/refsel.acids refsel.acids
cp ~/Tools/viral-ngs/lastal.acids lastal.acids
cp ~/Tools/viral-ngs/config.yaml config.yaml
cp ~/Tools/viral-ngs/samples-runs.txt samples-runs.txt
cp ~/Tools/viral-ngs/samples-depletion.txt samples-depletion.txt
cp ~/Tools/viral-ngs/samples-metagenomics.txt samples-metagenomics.txt
cp ~/Tools/viral-ngs/samples-assembly.txt samples-assembly.txt
cp ~/Tools/viral-ngs/samples-assembly-failures.txt samples-assembly-failures.txt
mkdir data
cd data
mkdir 00_raw
cd ../..
conda update --all
#all bin/tools can be installed automatically via scritps:
conda deactivate
#This script installs every Tool needed by viral-ngs
bin/easy-deploy-script/easy-deploy-viral-ngs.sh setup
sudo apt update
sudo apt install curl
#environment location: /home/jhuang/Tools/viral-ngs/bin/easy-deploy-script/viral-ngs-etc/mc3
#DEBUG: I installed a environment using script under the environment location: /home/jhuang/Tools/viral-ngs/bin/easy-deploy-script/viral-ngs-etc/mc3, how to activate the new installed env mc3?
#viral-ngs parent directory found
#Activating viral-ngs environment...
#Miniconda installed.
#Prepending miniconda to PATH...
#Linking to current viral-ngs install...
#GATK jar could not be found on this system for GATK version 3.8
#Please activate the viral-ngs conda environment and 'gatk-register /path/to/GenomeAnalysisTK.jar'
## Initialize Conda if not already done
#conda init
#source ~/.bashrc # or source ~/.zshrc
# Activate the new environment
conda activate /home/jhuang/Tools/viral-ngs/bin/easy-deploy-script/viral-ngs-etc/conda-env
# Verify the active environment
conda env list
* /home/jhuang/Tools/viral-ngs/bin/easy-deploy-script/viral-ngs-etc/conda-env
base /home/jhuang/miniconda3
viral-ngs-env /home/jhuang/miniconda3/envs/viral-ngs-env
viral-ngs-env-py37 /home/jhuang/miniconda3/envs/viral-ngs-env-py37
conda install -c bioconda gatk
gatk3 -version
#3.8-1-0-gf15c1c3ef
conda list
#conda install -c conda-forge -c bioconda -c defaults biopython pysam bmtagger picard pybedtools
#cdhit
mamba install -c conda-forge -c bioconda -c defaults biopython pyyaml muscle=3.8 trinity tbl2asn snpeff=4.1 megan spades last vphaser2 blast mvicuna diamond bwa krona bmtagger gap2seq mummer mummer4 kraken fastqc mafft picard prinseq openjdk=8 python=3.6 matplotlib
#Manually refers to the following tools to the location during hard-coded, don't need to be install: novoalign trimmomatic gatk samtools
#MODIFIED
bin/tools/spades.py #hard-coded /usr/bin/spades.py
bin/tools/mummer.py
bin/tools/novoalign.py #hard-coded Novoalign path: /home/jhuang/Tools/novocraft_v3/novoalign
#in config.yaml
NOVOALIGN_PATH: "/home/jhuang/Tools/novocraft_v3"
bin/tools/trimmomatic.py #hard-coded Trimmomatic path: /usr/local/bin/trimmomatic
bin/tools/gatk.py: /usr/local/bin/gatk #!!!!!!! BIG_BIG_BIG_BIG_BUG !!!!!!!: JAVA only with 1.8 + conda install openjdk=8 and /home/jhuang/Tools/SPANDx_v3.2/GenomeAnalysisTK.jar (version 3.2-2-gec30cee) + "mamba install python=3.8"
#---> using python=3.8, NO ERROR generated in "bin/reports.py consolidate_fastqc reports/fastqc/hCoV229E_Rluc/cleaned reports/fastqc/p10_DMSO/cleaned reports/fastqc/p10_K22/cleaned reports/fastqc/p10_K7523/cleaned reports/summary.fastqc.cleaned.txt"
bin/assembly.py #'Seq' object has no attribute 'ungap', solution len(seg.seq.ungap('N'))-->len(seg.seq.replace("N", ""))
bin/intrahost.py #'Seq' object has no attribute 'ungap', solution len(seq.seq.ungap('-'))-->len(seq.seq.replace("-", ""))
bin/reports.py
bin/samtools.py #hard-coded /home/jhuang/Tools/samtools-1.9/samtools
## Install samtools 1.9
#wget https://github.com/samtools/samtools/releases/download/1.9/samtools-1.9.tar.bz2
#tar -xvjf samtools-1.9.tar.bz2
#cd samtools-1.9
#./configure
#make
#sudo make install
picard.py currently does not to modify! #Picard 2.20 has warning, but still working! ********** NOTE: Picard's command line syntax is changing.
#DEBUG1: a pretty env is automatically installed: "/home/jhuang/Tools/viral-ngs/bin/easy-deploy-script/viral-ngs-etc/conda-env"? with bin/tools/__init__.py
# "prefix": "/home/jhuang/Tools/viral-ngs/bin/easy-deploy-script/viral-ngs-etc/conda-env",
# "success": true
#DEBUG2: What did the following command?
/home/jhuang/miniconda3/bin/python /home/jhuang/miniconda3/condabin/conda remove -q -y --json -p /home/jhuang/Tools/viral-ngs/bin/easy-deploy-script/viral-ngs-etc/conda-env mafft[snpeff]
#DEBUG3: Possibly need to manually annotate the vcf file with snpEff /home/jhuang/Tools/SPANDx_v3.2/snpEff/snpEff.jar
#bin/interhost.py snpEff inVcf=data/04_intrahost/isnvs.vcf.gz genomes=['PP810610'] outVcf=data/04_intrahost/isnvs.annot2.vcf.gz emailAddress=j.huang@uke.de loglevel=DEBUG
mkdir ~/Tools/viral-ngs/bin/easy-deploy-script/viral-ngs-etc/conda-env/share/snpeff-5.1-0/data/PP810610
cp PP810610.1.gb ~/Tools/viral-ngs/bin/easy-deploy-script/viral-ngs-etc/conda-env/share/snpeff-5.1-0/data/PP810610/genes.gbk
vim ~/Tools/viral-ngs/bin/easy-deploy-script/viral-ngs-etc/conda-env/share/snpeff-5.1-0/snpEff.config
/home/jhuang/Tools/viral-ngs/bin/easy-deploy-script/viral-ngs-etc/conda-env/bin/snpEff build -genbank PP810610 -d
#-t
snpEff eff -nodownload -no-downstream -no-intergenic -ud 100 -v CP040849.1 noAB_wildtype_trimmed.PASS.snps.vcf > noAB_wildtype_trimmed.PASS.snps.annotated.vcf
snakemake --printshellcmds --cores 50
get statistics from snakemake-output
samtools flagstat data/02_align_to_self/hCoV229E_Rluc.mapped.bam
samtools flagstat data/02_align_to_self/p10_DMSO.mapped.bam
samtools flagstat data/02_align_to_self/p10_K22.mapped.bam
samtools flagstat data/02_align_to_self/p10_K7523.mapped.bam
671888 + 0 properly paired (99.65% : N/A)
739432 + 0 properly paired (99.66% : N/A)
496190 + 0 properly paired (99.61% : N/A)
549030 + 0 properly paired (99.60% : N/A)
generate variant_annot.xls and coverages.xls
# -- generate isnvs_annot_complete__.txt, isnvs_annot_0.05.txt from ~/DATA/Data_Pietschmann_RSV_Probe3/data/04_intrahost
cp isnvs.annot.txt isnvs.annot_complete.txt
~/Tools/csv2xls-0.4/csv_to_xls.py isnvs.annot_complete.txt -d$'\t' -o isnvs.annot_complete.xls
#delete the columns patient, time, Hw and Hs and the header in the xls and save as txt file.
awk '{printf "%.3f\n", $5}' isnvs.annot_complete.csv > f5
cut -f1-4 isnvs.annot_complete.csv > f1_4
cut -f6- isnvs.annot_complete.csv > f6_
paste f1_4 f5 > f1_5
paste f1_5 f6_ > isnvs_annot_complete_.txt
cat header isnvs_annot_complete_.txt > isnvs_annot_complete__.txt
~/Tools/csv2xls-0.4/csv_to_xls.py isnvs_annot_complete__.txt -d$'\t' -o variant_annot.xls
#MANUALLY generate variant_annot_0.05.csv
#[OPTIONAL]: automatically generate the file variant_annot_0.05.csv
#awk ' $5 >= 0.05 ' isnvs_annot_complete__.txt > 0.05.csv
cut -f2-2 xP0/0.05.csv > xP0_ids
cut -f2-2 xDMSO/0.05.csv > xDMSO_ids
cut -f2-2 xComp28/0.05.csv > xComp28_ids
cut -f2-2 xComp29/0.05.csv > xComp29_ids
cut -f2-2 xComp32/0.05.csv > xComp32_ids
cut -f2-2 xLona/0.05.csv > xLona_ids
cat *_ids | sort -u -n > ids
replace \n with \\t" isnvs_annot_complete__.txt >> isnvs_annot_0.01.txt\ngrep -P "MK816924\\t in ids
mv ids get_0.02.sh
~/Tools/csv2xls-0.4/csv_to_xls.py variant_annot_0.05.csv isnvs_annot_complete__.txt -d$'\t' -o variant_annot.xls
# -- calculate the coverage
samtools depth ./data/02_align_to_self/hCoV229E_Rluc.mapped.bam > hCoV229E_Rluc_cov.txt
samtools depth ./data/02_align_to_self/p10_DMSO.mapped.bam > p10_DMSO_cov.txt
samtools depth ./data/02_align_to_self/p10_K22.mapped.bam > p10_K22_cov.txt
samtools depth ./data/02_align_to_self/p10_K7523.mapped.bam > p10_K7523_cov.txt
~/Tools/csv2xls-0.4/csv_to_xls.py hCoV229E_Rluc_cov.txt p10_DMSO_cov.txt p10_K22_cov.txt p10_K7523_cov.txt -d$'\t' -o coverages.xls
using bengal3_ac3 calling variants (not useful)
git clone https://github.com/huang/bacto
mv bacto/* ./
rm -rf bacto
#prepare raw_data and bacto-0.1.json
conda activate bengal3_ac3
/home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
CHR POS REF hCoV229E_Rluc p10_DMSO p10_K22 p10_K7523
PP810610 1464 T C C C C
PP810610 1699 C T T T T
PP810610 6691 C T T T T
PP810610 6919 C G G G G
PP810610 6922 A G G G G
PP810610 6925 G C C C C
PP810610 7294 T A A A A
PP810610 14472 T C C C C
PP810610 15458 T C C C C
PP810610 16035 C A A A A
PP810610 17430 T C C C C
PP810610 19289 G G G T G
PP810610 21183 T G G G G
PP810610 22636 T G G G G
PP810610 23022 T C C C C
PP810610 24781 C T T T T
PP810610 25163 C T T T T
PP810610 25264 C T T T T
PP810610 26838 G T T T T
using spandx calling variants (almost the same results to the one from viral-ngs!)
mkdir ~/miniconda3/envs/spandx/share/snpeff-5.1-2/data/PP810610
cp PP810610.1.gb ~/miniconda3/envs/spandx/share/snpeff-5.1-2/data/PP810610/genes.gbk
vim ~/miniconda3/envs/spandx/share/snpeff-5.1-2/snpEff.config
/home/jhuang/miniconda3/envs/spandx/bin/snpEff build PP810610 -d
gzip hCoV229E_Rluc_trimmed_P_1.fastq hCoV229E_Rluc_trimmed_P_2.fastq p10_DMSO_trimmed_P_1.fastq p10_DMSO_trimmed_P_2.fastq p10_K22_trimmed_P_1.fastq p10_K22_trimmed_P_2.fastq p10_K7523_trimmed_P_1.fastq p10_K7523_trimmed_P_2.fastq
ln -s /home/jhuang/Tools/spandx/ spandx
(spandx) nextflow run spandx/main.nf --fastq "trimmed/*_P_{1,2}.fastq.gz" --ref PP810610.fasta --annotation --database PP810610 -resume
->PP810610 1492 T A SNP T/A T/A T/A T/A MODIFIER
->PP810610 8289 C A SNP C/A C/A C C/A MODIFIER
->PP810610 8294 A G SNP A/G A A/G A MODIFIER
PP810610 8376 G T SNP G/T G G G MODIFIER
PP810610 9146 T C SNP T T T T/C MODIFIER
->PP810610 9174 G A SNP G G G G/A MODIFIER
PP810610 10145 A G SNP A A A A/G MODIFIER
->PP810610 10239 T G SNP T T T/G T MODIFIER
->PP810610 10310 G A SNP G G G G/A MODIFIER
->PP810610 10871 C T SNP C C/T T C/T MODIFIER
->PP810610 10898 G A SNP G G/A G G/A MODIFIER
->PP810610 11577 A C SNP A A/C A A MODIFIER
PP810610 18640 T G SNP T T T T/G MODIFIER
->PP810610 18646 C T SNP C C C C/T MODIFIER
PP810610 18701 A G SNP A A A A/G MODIFIER
PP810610 19028 C T SNP C C C C/T MODIFIER
PP810610 19289 G T SNP G G T G MODIFIER
-->PP810610 21027 C T SNP C C/T C C/T MODIFIER
->PP810610 21633 T C SNP T T/C T T MODIFIER
->PP810610 22215 T G SNP T T T T/G MODIFIER
->PP810610 23435 C T SNP C C T C/T MODIFIER
PP810610 24738 C *,A SNP C C */A C/A MODIFIER
PP810610 25025 C T SNP C C/T C C MODIFIER
->PP810610 25592 T C SNP T T/C T T MODIFIER
Consensus sequences of each and of all isolates
cat PP810610.1.fa OZ035258.1.fa MZ712010.1.fa OK662398.1.fa OK625404.1.fa KF293664.1.fa NC_002645.1.fa > all.fa
cp data/02_assembly/*.fasta ./
for sample in hCoV229E_Rluc p10_DMSO p10_K22 p10_K7523; do \
mv ${sample}.fasta ${sample}.fa
cat all.fa ${sample}.fa >> all.fa
done
cat RSV_dedup.fa all.fa > RSV_all.fa
mafft --clustalout --adjustdirection RSV_all.fa > RSV_all.aln
snp-sites RSV_all.aln -o RSV_all_.aln
点赞本文的读者
还没有人对此文章表态
没有评论
Transposon analyses for the nanopore sequencing
Updated List of nf-core Pipelines (Released) Sorted by Stars (as of November 22, 2024)
Variant Calling for Herpes Simplex Virus 1 from Patient Sample Using Capture Probe Sequencing
© 2023 XGenes.com Impressum