Viral variant calling

gene_x 0 like s 274 view s

Tags: pipeline

1: trimming using trimmomatic

    mkdir trimmed bams
    for sample in hCoV229E_Rluc p10_DMSO p10_K22 p10_K7523; do \
        java -jar /home/jhuang/Tools/Trimmomatic-0.36/trimmomatic-0.36.jar PE -threads 100 ./raw_data/${sample}_R1.fastq.gz ./raw_data/${sample}_R2.fastq.gz trimmed/${sample}_R1.fastq.gz trimmed/${sample}_unpaired_R1.fastq.gz trimmed/${sample}_R2.fastq.gz trimmed/${sample}_unpaired_R2.fastq.gz  ILLUMINACLIP:/home/jhuang/Tools/Trimmomatic-0.36/adapters/TruSeq3-PE-2.fa:2:30:10:8:TRUE LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36 AVGQUAL:20; \
    done

    #(optional) #The reference alignment can be downloaded from the ViPR site:
    #https://www.viprbrc.org/brc/workbenchSequenceSearch.spg?#uploadedFileId=20272&decorator=flavi&method=SubmitForm
    ##-- deduplicate fasta --
    ##awk 'BEGIN{RS=">"}NR>1{sub("\n","\t"); gsub("\n",""); print RS$0}' RSV.fa | awk '!seen[$0]++' | awk -v OFS="\n" '{for(i=2;i<NF;i++) head = head " " $i; print $1 " " head,$NF; head = ""}'
    ##sed -e '/^>/s/$/@/' -e 's/^>/#/' RSV.fa | tr -d '\n' | tr "#" "\n" | tr "@" "\t" | sort -u -k1,1 | sed -e 's/^/>/' -e 's/\t/\n/' > RSV_dedup.fa
    #from Bio import SeqIO
    #with open('RSV_dedup.fa', 'a') as outFile:
    #    record_ids = list()
    #    for record in SeqIO.parse('RSV.fa', 'fasta'):
    #        if record.id not in record_ids:
    #            record_ids.append( record.id )
    #            SeqIO.write(record, outFile, 'fasta')
  1. mapping
    mv PP810610.1.fasta PP810610.1.fa
    ref_fa="PP810610.1.fa";
    # raw mapping
    for sample in hCoV229E_Rluc p10_DMSO p10_K22 p10_K7523; do \
        bwa index ${ref_fa}; \
        bwa mem -M -t 16 ${ref_fa} trimmed/${sample}_R1.fastq.gz trimmed/${sample}_R2.fastq.gz | samtools view -bS - > bams/${sample}_genome_alignment.bam; \
        #for table filling using the following commands! -->3000000 \
        #bwa mem -M -t 14 ${ref_fa} ${sample}_R1.fastq.gz ${sample}_R2.fastq.gz | samtools view -bS -F 256 - > bams/${sample}_uniqmap.bam; \
    done
    

3: AddOrReplaceReadGroup is IMPORTANT step, otherwise the step viral_ngs cannot run correctly

    #MODIFIED
    #default_jvm_mem_opts="-Xms512m -Xmx1g"
    #--> default_jvm_mem_opts="-Xms256g -Xmx512g"
    for sample in hCoV229E_Rluc p10_DMSO p10_K22 p10_K7523; do \
    picard AddOrReplaceReadGroups I=bams/${sample}_genome_alignment.bam O=data/00_raw/${sample}.bam SORT_ORDER=coordinate CREATE_INDEX=true RGPL=illumina RGID=$sample RGSM=$sample RGLB=standard RGPU=$sample VALIDATION_STRINGENCY=LENIENT; \
    done
  1. set values in samples-*.txt before running viral-ngs

    ln -s ~/Tools/viral-ngs/Snakefile Snakefile
    ln -s  ~/Tools/viral-ngs/bin bin
    cp  ~/Tools/viral-ngs/refsel.acids refsel.acids
    cp  ~/Tools/viral-ngs/lastal.acids lastal.acids
    cp  ~/Tools/viral-ngs/config.yaml config.yaml
    cp  ~/Tools/viral-ngs/samples-runs.txt samples-runs.txt
    cp  ~/Tools/viral-ngs/samples-depletion.txt samples-depletion.txt
    cp  ~/Tools/viral-ngs/samples-metagenomics.txt samples-metagenomics.txt
    cp  ~/Tools/viral-ngs/samples-assembly.txt samples-assembly.txt
    cp  ~/Tools/viral-ngs/samples-assembly-failures.txt samples-assembly-failures.txt
    mkdir data
    cd data
    mkdir 00_raw
    cd ../..
    
    conda update --all
    
    #all bin/tools can be installed automatically via scritps:
    conda deactivate
    
    #This script installs every Tool needed by viral-ngs
    bin/easy-deploy-script/easy-deploy-viral-ngs.sh setup
    sudo apt update
    sudo apt install curl
    #environment location: /home/jhuang/Tools/viral-ngs/bin/easy-deploy-script/viral-ngs-etc/mc3
    #DEBUG: I installed a environment using script under the environment location: /home/jhuang/Tools/viral-ngs/bin/easy-deploy-script/viral-ngs-etc/mc3, how to activate the new installed env mc3?
    
    #viral-ngs parent directory found
    #Activating viral-ngs environment...
    #Miniconda installed.
    #Prepending miniconda to PATH...
    #Linking to current viral-ngs install...
    #GATK jar could not be found on this system for GATK version 3.8
    #Please activate the viral-ngs conda environment and 'gatk-register /path/to/GenomeAnalysisTK.jar'
    
    ## Initialize Conda if not already done
    #conda init
    #source ~/.bashrc  # or source ~/.zshrc
    
    # Activate the new environment
    conda activate /home/jhuang/Tools/viral-ngs/bin/easy-deploy-script/viral-ngs-etc/conda-env
    
    # Verify the active environment
    conda env list
                          *  /home/jhuang/Tools/viral-ngs/bin/easy-deploy-script/viral-ngs-etc/conda-env
    base                     /home/jhuang/miniconda3
    viral-ngs-env            /home/jhuang/miniconda3/envs/viral-ngs-env
    viral-ngs-env-py37       /home/jhuang/miniconda3/envs/viral-ngs-env-py37
    
    conda install -c bioconda gatk
    gatk3 -version
    #3.8-1-0-gf15c1c3ef
    
    conda list
    #conda install -c conda-forge -c bioconda -c defaults biopython pysam bmtagger picard pybedtools
    #cdhit
    mamba install -c conda-forge -c bioconda -c defaults biopython pyyaml muscle=3.8 trinity tbl2asn snpeff=4.1 megan spades last  vphaser2 blast mvicuna diamond bwa krona bmtagger gap2seq mummer mummer4 kraken fastqc mafft picard prinseq openjdk=8 python=3.6 matplotlib
    #Manually refers to the following tools to the location during hard-coded, don't need to be install: novoalign trimmomatic gatk samtools
    
    #MODIFIED
    bin/tools/spades.py          #hard-coded /usr/bin/spades.py
    bin/tools/mummer.py
    bin/tools/novoalign.py       #hard-coded Novoalign path: /home/jhuang/Tools/novocraft_v3/novoalign
    #in config.yaml
    NOVOALIGN_PATH: "/home/jhuang/Tools/novocraft_v3"
    bin/tools/trimmomatic.py     #hard-coded Trimmomatic path: /usr/local/bin/trimmomatic
    bin/tools/gatk.py: /usr/local/bin/gatk  #!!!!!!! BIG_BIG_BIG_BIG_BUG !!!!!!!: JAVA only with 1.8 + conda install openjdk=8 and /home/jhuang/Tools/SPANDx_v3.2/GenomeAnalysisTK.jar (version 3.2-2-gec30cee) + "mamba install python=3.8"
    #---> using python=3.8, NO ERROR generated in "bin/reports.py consolidate_fastqc reports/fastqc/hCoV229E_Rluc/cleaned reports/fastqc/p10_DMSO/cleaned reports/fastqc/p10_K22/cleaned reports/fastqc/p10_K7523/cleaned reports/summary.fastqc.cleaned.txt"
    bin/assembly.py              #'Seq' object has no attribute 'ungap', solution len(seg.seq.ungap('N'))-->len(seg.seq.replace("N", ""))
    bin/intrahost.py             #'Seq' object has no attribute 'ungap', solution len(seq.seq.ungap('-'))-->len(seq.seq.replace("-", ""))
    bin/reports.py
    bin/samtools.py              #hard-coded /home/jhuang/Tools/samtools-1.9/samtools
    ## Install samtools 1.9
    #wget https://github.com/samtools/samtools/releases/download/1.9/samtools-1.9.tar.bz2
    #tar -xvjf samtools-1.9.tar.bz2
    #cd samtools-1.9
    #./configure
    #make
    #sudo make install
    picard.py currently does not to modify! #Picard 2.20 has warning, but still working! ********** NOTE: Picard's command line syntax is changing.
    
    #DEBUG1: a pretty env is automatically installed: "/home/jhuang/Tools/viral-ngs/bin/easy-deploy-script/viral-ngs-etc/conda-env"? with bin/tools/__init__.py
    #  "prefix": "/home/jhuang/Tools/viral-ngs/bin/easy-deploy-script/viral-ngs-etc/conda-env",
    #  "success": true
    
    #DEBUG2: What did the following command?
    /home/jhuang/miniconda3/bin/python /home/jhuang/miniconda3/condabin/conda remove -q -y --json -p /home/jhuang/Tools/viral-ngs/bin/easy-deploy-script/viral-ngs-etc/conda-env mafft[snpeff]
    
    #DEBUG3: Possibly need to manually annotate the vcf file with snpEff /home/jhuang/Tools/SPANDx_v3.2/snpEff/snpEff.jar
    #bin/interhost.py snpEff inVcf=data/04_intrahost/isnvs.vcf.gz genomes=['PP810610'] outVcf=data/04_intrahost/isnvs.annot2.vcf.gz emailAddress=j.huang@uke.de loglevel=DEBUG
    mkdir ~/Tools/viral-ngs/bin/easy-deploy-script/viral-ngs-etc/conda-env/share/snpeff-5.1-0/data/PP810610
    cp PP810610.1.gb ~/Tools/viral-ngs/bin/easy-deploy-script/viral-ngs-etc/conda-env/share/snpeff-5.1-0/data/PP810610/genes.gbk
    vim ~/Tools/viral-ngs/bin/easy-deploy-script/viral-ngs-etc/conda-env/share/snpeff-5.1-0/snpEff.config
    /home/jhuang/Tools/viral-ngs/bin/easy-deploy-script/viral-ngs-etc/conda-env/bin/snpEff build -genbank PP810610      -d
    #-t
    snpEff eff -nodownload -no-downstream -no-intergenic -ud 100 -v CP040849.1 noAB_wildtype_trimmed.PASS.snps.vcf > noAB_wildtype_trimmed.PASS.snps.annotated.vcf
    
    snakemake --printshellcmds --cores 50
    
  2. get statistics from snakemake-output

    samtools flagstat data/02_align_to_self/hCoV229E_Rluc.mapped.bam
    samtools flagstat data/02_align_to_self/p10_DMSO.mapped.bam
    samtools flagstat data/02_align_to_self/p10_K22.mapped.bam
    samtools flagstat data/02_align_to_self/p10_K7523.mapped.bam
    
    671888 + 0 properly paired (99.65% : N/A)
    739432 + 0 properly paired (99.66% : N/A)
    496190 + 0 properly paired (99.61% : N/A)
    549030 + 0 properly paired (99.60% : N/A)
    
  3. generate variant_annot.xls and coverages.xls

    # -- generate isnvs_annot_complete__.txt, isnvs_annot_0.05.txt from ~/DATA/Data_Pietschmann_RSV_Probe3/data/04_intrahost
    cp isnvs.annot.txt isnvs.annot_complete.txt
    ~/Tools/csv2xls-0.4/csv_to_xls.py isnvs.annot_complete.txt -d$'\t' -o isnvs.annot_complete.xls
    #delete the columns patient, time, Hw and Hs and the header in the xls and save as txt file.
    
    awk '{printf "%.3f\n", $5}' isnvs.annot_complete.csv > f5
    cut -f1-4 isnvs.annot_complete.csv > f1_4
    cut -f6- isnvs.annot_complete.csv > f6_
    paste f1_4 f5 > f1_5
    paste f1_5 f6_ > isnvs_annot_complete_.txt
    cat header isnvs_annot_complete_.txt > isnvs_annot_complete__.txt
    ~/Tools/csv2xls-0.4/csv_to_xls.py isnvs_annot_complete__.txt -d$'\t' -o variant_annot.xls
    
    #MANUALLY generate variant_annot_0.05.csv
    
    #[OPTIONAL]: automatically generate the file variant_annot_0.05.csv
    #awk ' $5 >= 0.05 ' isnvs_annot_complete__.txt > 0.05.csv
    cut -f2-2 xP0/0.05.csv > xP0_ids
    cut -f2-2 xDMSO/0.05.csv > xDMSO_ids
    cut -f2-2 xComp28/0.05.csv > xComp28_ids
    cut -f2-2 xComp29/0.05.csv > xComp29_ids
    cut -f2-2 xComp32/0.05.csv > xComp32_ids
    cut -f2-2 xLona/0.05.csv > xLona_ids
    cat *_ids | sort -u -n > ids
    replace \n with \\t" isnvs_annot_complete__.txt >> isnvs_annot_0.01.txt\ngrep -P "MK816924\\t  in ids
    mv ids get_0.02.sh
    ~/Tools/csv2xls-0.4/csv_to_xls.py variant_annot_0.05.csv isnvs_annot_complete__.txt -d$'\t' -o variant_annot.xls
    
    # -- calculate the coverage
    samtools depth ./data/02_align_to_self/hCoV229E_Rluc.mapped.bam > hCoV229E_Rluc_cov.txt
    samtools depth ./data/02_align_to_self/p10_DMSO.mapped.bam > p10_DMSO_cov.txt
    samtools depth ./data/02_align_to_self/p10_K22.mapped.bam > p10_K22_cov.txt
    samtools depth ./data/02_align_to_self/p10_K7523.mapped.bam > p10_K7523_cov.txt
    ~/Tools/csv2xls-0.4/csv_to_xls.py hCoV229E_Rluc_cov.txt p10_DMSO_cov.txt p10_K22_cov.txt p10_K7523_cov.txt -d$'\t' -o coverages.xls
    
  4. using bengal3_ac3 calling variants (not useful)

    git clone https://github.com/huang/bacto
    mv bacto/* ./
    rm -rf bacto
    #prepare raw_data and bacto-0.1.json
    conda activate bengal3_ac3
    /home/jhuang/miniconda3/envs/snakemake_4_3_1/bin/snakemake --printshellcmds
    
    CHR     POS     REF     hCoV229E_Rluc   p10_DMSO        p10_K22 p10_K7523
    PP810610        1464    T       C       C       C       C
    PP810610        1699    C       T       T       T       T
    PP810610        6691    C       T       T       T       T
    PP810610        6919    C       G       G       G       G
    PP810610        6922    A       G       G       G       G
    PP810610        6925    G       C       C       C       C
    PP810610        7294    T       A       A       A       A
    PP810610        14472   T       C       C       C       C
    PP810610        15458   T       C       C       C       C
    PP810610        16035   C       A       A       A       A
    PP810610        17430   T       C       C       C       C
    PP810610        19289   G       G       G       T       G
    PP810610        21183   T       G       G       G       G
    PP810610        22636   T       G       G       G       G
    PP810610        23022   T       C       C       C       C
    PP810610        24781   C       T       T       T       T
    PP810610        25163   C       T       T       T       T
    PP810610        25264   C       T       T       T       T
    PP810610        26838   G       T       T       T       T
    
  5. using spandx calling variants (almost the same results to the one from viral-ngs!)

    mkdir ~/miniconda3/envs/spandx/share/snpeff-5.1-2/data/PP810610
    cp PP810610.1.gb  ~/miniconda3/envs/spandx/share/snpeff-5.1-2/data/PP810610/genes.gbk
    vim ~/miniconda3/envs/spandx/share/snpeff-5.1-2/snpEff.config
    /home/jhuang/miniconda3/envs/spandx/bin/snpEff build  PP810610     -d
    gzip hCoV229E_Rluc_trimmed_P_1.fastq hCoV229E_Rluc_trimmed_P_2.fastq p10_DMSO_trimmed_P_1.fastq p10_DMSO_trimmed_P_2.fastq p10_K22_trimmed_P_1.fastq p10_K22_trimmed_P_2.fastq p10_K7523_trimmed_P_1.fastq p10_K7523_trimmed_P_2.fastq
    ln -s /home/jhuang/Tools/spandx/ spandx
    (spandx) nextflow run spandx/main.nf --fastq "trimmed/*_P_{1,2}.fastq.gz" --ref PP810610.fasta --annotation --database PP810610 -resume
    
    ->PP810610  1492    T   A   SNP T/A T/A T/A T/A     MODIFIER
    ->PP810610  8289    C   A   SNP C/A C/A C   C/A     MODIFIER
    ->PP810610  8294    A   G   SNP A/G A   A/G A       MODIFIER
    PP810610    8376    G   T   SNP G/T G   G   G       MODIFIER
    PP810610    9146    T   C   SNP T   T   T   T/C     MODIFIER
    ->PP810610  9174    G   A   SNP G   G   G   G/A     MODIFIER
    PP810610    10145   A   G   SNP A   A   A   A/G     MODIFIER
    ->PP810610  10239   T   G   SNP T   T   T/G T       MODIFIER
    ->PP810610  10310   G   A   SNP G   G   G   G/A     MODIFIER
    ->PP810610  10871   C   T   SNP C   C/T T   C/T     MODIFIER
    ->PP810610  10898   G   A   SNP G   G/A G   G/A     MODIFIER
    ->PP810610  11577   A   C   SNP A   A/C A   A       MODIFIER
    PP810610    18640   T   G   SNP T   T   T   T/G     MODIFIER
    ->PP810610  18646   C   T   SNP C   C   C   C/T     MODIFIER
    PP810610    18701   A   G   SNP A   A   A   A/G     MODIFIER
    PP810610    19028   C   T   SNP C   C   C   C/T     MODIFIER
    PP810610    19289   G   T   SNP G   G   T   G       MODIFIER
    -->PP810610 21027   C   T   SNP C   C/T C   C/T     MODIFIER
    ->PP810610  21633   T   C   SNP T   T/C T   T       MODIFIER
    ->PP810610  22215   T   G   SNP T   T   T   T/G     MODIFIER
    ->PP810610  23435   C   T   SNP C   C   T   C/T     MODIFIER
    PP810610    24738   C   *,A SNP C   C   */A C/A     MODIFIER
    PP810610    25025   C   T   SNP C   C/T C   C       MODIFIER
    ->PP810610  25592   T   C   SNP T   T/C T   T       MODIFIER
    
  6. Consensus sequences of each and of all isolates

    cat PP810610.1.fa OZ035258.1.fa MZ712010.1.fa OK662398.1.fa OK625404.1.fa KF293664.1.fa NC_002645.1.fa > all.fa
    cp data/02_assembly/*.fasta ./
    for sample in hCoV229E_Rluc p10_DMSO p10_K22 p10_K7523; do \
    mv ${sample}.fasta ${sample}.fa
    cat all.fa ${sample}.fa >> all.fa
    done
    
    cat RSV_dedup.fa all.fa > RSV_all.fa
    mafft --clustalout --adjustdirection RSV_all.fa > RSV_all.aln
    snp-sites RSV_all.aln -o RSV_all_.aln
    

like unlike

点赞本文的读者

还没有人对此文章表态


本文有评论

没有评论

看文章,发评论,不要沉默


© 2023 XGenes.com Impressum