Using vrap and bacto to process Acinetobacter baumannii isolates

gene_x 0 like s 321 view s

Tags: pipeline, DNA-seq

  1. input data

    #Deciphering Evolutionary Trajectories in Acinetobacter baumannii: A Comparative Study of Isolates ATCC 19606, AYE, and ATCC 17978
    
    #PRJNA1041744
    
    A6 WT - Acinetobacter baumannii ATCC 19606, CP059040
    A10 CraA - Acinetobacter baumannii ATCC 19606, CP059040
    BIT33_RS17375: adeJ
    BIT33_RS17370: adeI
    BIT33_RS11855: craA (MKNIQTTALNRTTLMFPLALVLFEFAVYIGNDLIQPAMLAITED
                 FGVSATWAPSSMSFYLLGGASVAWLLGPLSDRLGRKKVLLSGVLFFALCCFLILLTRQ
                 IEHFLTLRFLQGIGLSVISAVGYAAIQENFAERDAIKVMALMANISLLAPLLGPVLGA
                 FLIDYVSWHWGFVAIALLALLSWVGLKKQMPSHKVSVTKQPFSYLFDDFKKVFSNRQF
                 LGLTLALPLVGMPLMLWIALSPIILVDELKLTSVQYGLAQFPVFLGLIVGNIVLIKII
                 DRLALGKTVLIGLPIMLTGTLILILGVVWQAYLIPCLLIGMTLICFGEGISFSVLYRF
                 ALMSSEVSKGTVAAAVSMLLMTSFFAMIELVRYLYTQFHLWAFVLSAFAFIALWFTQP
                 RLALKREMQERVAQDLH) MFS transporter --> H0N29_01695
    364663 - 365770
    
    A12 AYE - Acinetobacter baumannii AYE,         CU459141
    A19 17978 - Acinetobacter baumannii ATCC 17978  CP033110
    A1S_2736: adeJ
    A1S_2735: adeI
    A1S_3146: craA
    
    mv A6WT_HQ_R1.fq.gz A6_WT_HQ_R1.fastq.gz 
    mv A6WT_HQ_R2.fq.gz A6_WT_HQ_R2.fastq.gz 
    mv A10CraA_HQ_R1.fq.gz A10_CraA_HQ_R1.fastq.gz 
    mv A10CraA_HQ_R2.fq.gz A10_CraA_HQ_R2.fastq.gz 
    mv A12AYE_HQ_R1.fq.gz A12_AYE_HQ_R1.fastq.gz 
    mv A12AYE_HQ_R2.fq.gz A12_AYE_HQ_R2.fastq.gz 
    mv A1917978_HQ_R1.fq.gz A19_17978_HQ_R1.fastq.gz 
    mv A1917978_HQ_R2.fq.gz A19_17978_HQ_R2.fastq.gz
    
    #Nucleotide Accession Prefixes
    #AP,BS                                                      DDBJ           Genome project data
    #AL,BX,CR,CT,CU,FP,FQ,FR                                    EMBL           Genome project data
    #AE,CP,CY                                                   GenBank        Genome project data
    
    #transposon detection using gubbins
    
    #TODOs: Submit A6, A12 and A19 to NCBI, A10 not, since the assembly is very bad! Could do mapping plot of A10 on A6, and three genebank-files to Tam!     
    #Wuen Ee Foong
    #Jiabin Huang
    #Heng-Keat Tam, Hengyang Medical College, University of South China
    #pA12AYE1
    
  2. prokka

    for sample in A19_17978_HQ; do
    prokka --force --outdir prokka/${sample} --cpus 2 --usegenus --genus Acinetobacter --kingdom Bacteria --species baumannii --addgenes --addmrna --prefix ${sample} --locustag ${sample} shovill/${sample}/contigs.fa -hmm /media/jhuang/Titisee/GAMOLA2/TIGRfam_db/TIGRFAMs_15.0_HMM.LIB
    done
    
    (prokka on titisee) for the remaining steps (f,f,f,f,f,t,f(variants_calling),t,t,t in bacto-0.1.json)
    
    #TODO: annotate the SNPs with antimicrobial gene database!
    #TODO: ask Qianjie die sample are directly gotten from the patient, oder did some motification on the reference genome?
    

3.1. vrap (TODO: deploy vrap on xgenes.com, it does not use the database. under (vrap) jhuang@hamm[hamburg])

    #In summary: --(optional) host --> bowtie/host (bowtie_database for host-substraction)
    #            --(optional) virus --> blast/db/virus (blast1, in nt_dbs, custom blastn database before downloaded_db!)
    #            --(always) defined in download_db.py (setting by modifying the file) and run default --> database/viral_db/viral_nucleotide (blast2, in nt_dbs)
    #                                                                                                 --> database/viral_db/viral_protein (blast3, in prot_db)
    #            --(optional) nt+nr (blast4 and blast5)

    #Under (vrap) jhuang@hamm or (bengal3_ac3) jhuang@hamburg
    #DEBUG: {path}/external_tools/Lighter-master/lighter-->/home/jhuang/miniconda3/envs/vrap/bin/lighter
    #DEBUG: {path}/external_tools/SPAdes-3.11.1-Linux/bin/spades.py-->/home/jhuang/miniconda3/envs/vrap/bin/spades.py

    #gi|1690449040|gb|CP029454.1|    Bacillus cereus strain FORC087 chromosome, complete genome
    #gi|1621560499|gb|CP039269.1|    Bacillus cereus strain MH19 chromosome, complete genome
    #gi|1783154430|gb|CP040340.1|    Bacillus cereus strain DLOU-Tangshan chromosome, complete genome
    #CP059040        3980852 10      70      71
    #CU459141        3936291 4037743 70      71
    #CP033110        3902037 8030278 70      71

    #CP059040 --> A6 and A10
    #CU459141 --> A12 
    #CP033110 --> A19

    #Command line: /home/jhuang/miniconda3/envs/bengal3_ac3/bin/spades.py -1 A6_WT_HQ_R1.fastq.gz -2 A6_WT_HQ_R2.fastq.gz --careful --trusted-contigs CP059040.fasta -o spades_A6_ATCC19606

    #CP059040_RagTag 3866625 17      3866625 3866626
    #CAP_1_length_17300      17300   3866663 17300   17301
    #CAP_2_length_928        928     3883982 928     929
    #NODE_17_length_11061_cov_141.020669     11061   3884948 11061   11062
    #NODE_19_length_1654_cov_105.893255      1654    3896046 1654    1655
    #NODE_20_length_1102_cov_153.073846      1102    3897737 1102    1103
    #NODE_21_length_1040_cov_172.078861      1040    3898876 1040    1041

    #NOTE that we need to adjust the taxid in download_db.py to download viral_nucleotide and viral_protein before running vrap/vrap.py.

    #A10 
    #Blast with 5 DBs when --virus (blast/db/virus), viral_nucleotide (download_db.rb), viral_protein (download_db.rb), nt, nr
    #Bowtie2 with 1 DB when --host (bowtie/host)
    #NOTE the fasta files given with --host and --virus should be deduplicated!
    #NOTE that in vrap_CP059040.py the trusted-contigs is used "--trusted-contigs /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/CP074710.fasta"
    vrap/vrap_CP059040.py -1 ./results_ATCC19606/trimmed/A10_CraA_HQ_trimmed_P_1.fastq.gz -2 ./results_ATCC19606/trimmed/A10_CraA_HQ_trimmed_P_2.fastq.gz -o A10_vrap_out --host Bacillus_cereus.fasta --virus=Acinetobacter_baumannii.fasta   -n /mnt/h1/jhuang/blast/nt -a /mnt/h1/jhuang/blast/nr -t 55

    #A6
    vrap/vrap_CP059040.py -1 ./results_ATCC19606/trimmed/A6_WT_HQ_trimmed_P_1.fastq.gz -2 ./results_ATCC19606/trimmed/A6_WT_HQ_trimmed_P_2.fastq.gz -o A6_vrap_out  -n /mnt/h1/jhuang/blast/nt -a /mnt/h1/jhuang/blast/nr -t 55

    #A12
    vrap/vrap_CU459141.py -1 ./results_AYE/trimmed/A12_AYE_HQ_trimmed_P_1.fastq.gz -2 ./results_AYE/trimmed/A12_AYE_HQ_trimmed_P_2.fastq.gz -o A12_vrap_out  -n /mnt/h1/jhuang/blast/nt -a /mnt/h1/jhuang/blast/nr -t 55

    #A19 
    vrap/vrap_CP033110.py -1 ./results_ATCC17978/trimmed/A19_17978_HQ_trimmed_P_1.fastq.gz -2 ./results_ATCC17978/trimmed/A19_17978_HQ_trimmed_P_2.fastq.gz -o A19_vrap_out  -n /mnt/h1/jhuang/blast/nt -a /mnt/h1/jhuang/blast/nr -t 55

    # -- A10 LOG --
    START VrAP
    START checking blast db
    START lighter
    START estimate k-mer size
    k-mer size: 15037634
    /home/jhuang/miniconda3/envs/vrap/bin/lighter -r A10_CraA_HQ_trimmed_P_1.fastq.gz -r A10_CraA_HQ_trimmed_P_2.fastq.gz -K 20 15037634 -od A10_vrap_out/lighter -t 55
    START flash
    /home/jhuang/Tools/vrap/external_tools/FLASH-1.2.11/flash A10_vrap_out/lighter/A10_CraA_HQ_trimmed_P_1.cor.fq.gz A10_vrap_out/lighter/A10_CraA_HQ_trimmed_P_2.cor.fq.gz -o flash -d A10_vrap_out/flash -M 1000

    START bowtie2-build
    START bowtie2 pe
    #MAPPING to 1 DB: bowtie/host

    START spades    
    START cap3
    #/home/jhuang/Tools/vrap/external_tools/cap3 A10_vrap_out/spades/contigs.fasta -y 100  
    START calculating orf density
    START HMMER

    START blast
    #grep ">" A10_vrap_out/blast/custom_viral_seq.fa == "--virus Acinetobacter_baumannii.fasta" 
    #MAPPING to 5 DBs: blast/db/virus[1], viral_nucleotide (defined in the download_db.py)[2], viral_protein (defined in the download_db.py)[3], nt[4], nr[5]
    #INPUT_SEQ: 
    #(Assuming, if annotated with the previous database, will not translate to the next level). For the example below: we have no record finding in [2,1], then we have 1317-0 in blast/blastn.fa; In [3] we find 'cut -f1 blastx.csv | sort -u | wc -l = 567', then we have 1317-567=750; In [4], we 'cut -f1 blastn_nt.csv | sort -u | wc -l=710', then we have 750-710=40 in blast/blastn_nt.fa; In [5], we find 'cut -f1 blastn_nr.csv | sort -u | wc -l=39', then we remain 40-39=1 record in blastx_nr.fa.
    #grep ">" vrap_contig.fasta | wc -l   #1317 
    #grep ">" blast/blastn.fa | wc -l     #1317
    #grep ">" blast/blastx.fa | wc -l     #750
    #grep ">" blast/blastn_nt.fa | wc -l  #40
    #grep ">" blast/blastx_nr.fa | wc -l  #1
    #--
    #grep ">" vrap_contig.fasta | wc -l   #2370 
    #grep ">" blast/blastn.fa | wc -l     #2306 --> 64 contigs should belong to A. baumannii! can be annotated with --custom_virus (3 speicies of Acinetobacter baumannii) and downloaded [virus_nt_db] (2.6 G Acinetobacter baumannii)
    #grep ">" blast/blastx.fa | wc -l     #1245: 1061 contigs can be annotated with downloaded [virus_aa_db]
    #grep ">" blast/blastn_nt.fa | wc -l  #41: 1204 contigs can be annotated with nt
    #grep ">" blast/blastx_nr.fa | wc -l  #1: 40 contigs can be annotated with nr 
    #--> extract 64 contigs using 'python3 extract_unique_sequences.py A10_vrap_out/vrap_contig.fasta A10_vrap_out/blast/blastn.fa A10_extracted_contigs.fa'

    [1_INDEX]/home/jhuang/Tools/vrap/external_tools/blast/makeblastdb -in /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/blast/custom_viral_seq.fa -dbtype nucl -parse_seqids -out /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/blast/db/virus >> /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/vrap.log 2>> /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/vrap.log

    [2,1]/home/jhuang/Tools/vrap/external_tools/blast/blastn -num_threads 55 -query /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/vrap_contig.fasta -db "/home/jhuang/Tools/vrap/database/viral_db/viral_nucleotide /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/blast/db/virus"  -evalue 1e-4 -outfmt "6 qseqid qstart qend sstart send evalue length pident sseqid stitle qcovs qcovhsp sacc slen qlen" -max_target_seqs 1 -out /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/blast/blastn.csv >> /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/vrap.log

    [3]/home/jhuang/Tools/vrap/external_tools/blast/blastx -num_threads 55 -query /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/blast/blastn.fa -db "/home/jhuang/Tools/vrap/database/viral_db/viral_protein"  -evalue 1e-6 -outfmt "6 qseqid qstart qend sstart send evalue length pident sseqid stitle qcovs qcovhsp sacc slen qlen" -max_target_seqs 1 -out /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/blast/blastx.csv >> /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/vrap.log

    [4]/home/jhuang/Tools/vrap/external_tools/blast/blastn -num_threads 55 -query /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/blast/blastx.fa -db "/mnt/h1/jhuang/blast/nt"  -evalue 1e-4 -outfmt "6 qseqid qstart qend sstart send evalue length pident sseqid stitle qcovs qcovhsp sacc slen qlen" -max_target_seqs 1 -out /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/blast/blastn_nt.csv >> /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/vrap.log

    [5]/home/jhuang/Tools/vrap/external_tools/blast/blastx -num_threads 55 -query /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/blast/blastn_nt.fa -db "/mnt/h1/jhuang/blast/nr"  -evalue 1e-6 -outfmt "6 qseqid qstart qend sstart send evalue length pident sseqid stitle qcovs qcovhsp sacc slen qlen" -max_target_seqs 1 -out /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/blast/blastx_nr.csv >> /mnt/h1/jhuang/DATA/Data_Tam_Acinetobacter_baumannii/A10_vrap_out/vrap.log

    VrAP pipeline finished.
    Thank you for using VrAP!

3.2. ragtag.py ((bengal3_ac3) jhuang@hamm)

    #ragtag.py patch ragtag_output_scaffold/ragtag.scaffold.fasta CP059040.fasta

    ragtag.py scaffold CP059040.fasta A6_vrap_out/vrap_contig.fasta
        Acinetobacter baumannii strain ATCC 19606 plasmid unnamed, complete sequence    Acinetobacter baumannii     19653   39248   62%     0.0     99.97%  18598   CP074586.1

    python3 extract_unique_sequences.py vrap_contig.fasta blast/blastn.fa A10_extracted_contigs.fa
    ragtag.py scaffold CP059040.fasta A10_extracted_contigs.fa
    python3 filter_contigs.py A10_ragtag_output/ragtag.scaffold.fasta A10_ragtag_contigs_2000.fa
    # Check the converage to determine if they are chr or plasmids!

    ragtag.py scaffold CU459141.fasta A12_vrap_out/vrap_contig.fasta
        Acinetobacter baumannii str. AYE plasmid p2ABAYE, complete genome   Acinetobacter baumannii AYE     16835   18307   100%    0.0     99.99%  9661    CU459138.1
        Acinetobacter baumannii str. AYE plasmid p1ABAYE, complete genome   Acinetobacter baumannii AYE     9494    10856   100%    0.0     100.00%     5644    CU459137.1
        Acinetobacter baumannii str. AYE plasmid p4ABAYE, complete genome   Acinetobacter baumannii AYE     3709    5265    100%    0.0     99.95%  2726    CU459139.1
        Acinetobacter baumannii str. AYE plasmid p3ABAYE, complete genome   Acinetobacter baumannii AYE     1.384e+05   1.838e+05   100%    0.0     100.00%     94413   CU459140.1

    ragtag.py scaffold CP033110.fasta A19_vrap_out/vrap_contig.fasta
    #using CP074710.1 in the round2 (see below)
        Acinetobacter baumannii strain ATCC 17978 plasmid unnamed1, complete sequence   Acinetobacter baumannii     2.373e+05   2.463e+05   100%    0.0     100.00%     148956  CP074709.1
        Acinetobacter baumannii strain ATCC 17978 plasmid unnamed5, complete sequence   Acinetobacter baumannii     24842   50362   100%    0.0     100.00%     38118   CP074711.1
        Acinetobacter baumannii strain ATCC 17978 plasmid unnamed3, complete sequence   Acinetobacter baumannii     20953   37147   100%    0.0     100.00%     20351   CP074707.1
        Acinetobacter baumannii ATCC 17978 chromosome, complete genome  Acinetobacter baumannii ATCC 17978  13245   16527   100%    0.0     100.00%     4005343     CP053098.1

3.3 vrap round2

    #A10 
    vrap/vrap_A6_chr.py -1 ./results_ATCC19606/trimmed/A10_CraA_HQ_trimmed_P_1.fastq.gz -2 ./results_ATCC19606/trimmed/A10_CraA_HQ_trimmed_P_2.fastq.gz -o A10_vrap_out --host Bacillus_cereus.fasta --virus=Acinetobacter_baumannii.fasta   -n /mnt/h1/jhuang/blast/nt -a /mnt/h1/jhuang/blast/nr -t 55

    #A19 
    vrap/vrap_CP074710.py -1 ./results_ATCC17978/trimmed/A19_17978_HQ_trimmed_P_1.fastq.gz -2 ./results_ATCC17978/trimmed/A19_17978_HQ_trimmed_P_2.fastq.gz -o A19_vrap_out  -n /mnt/h1/jhuang/blast/nt -a /mnt/h1/jhuang/blast/nr -t 55

3.4 ragtag round2

    python3 extract_unique_sequences.py A10_vrap_out/vrap_contig.fasta A10_vrap_out/blast/blastn.fa A10_vrap_out/A10_extracted_contigs.fa
    ragtag.py scaffold A6_chr.fasta A10_vrap_out/A10_extracted_contigs.fa
    mv ragtag_output A10_ragtag_output
    python3 filter_contigs.py A10_ragtag_output/ragtag.scaffold.fasta A10_ragtag_contigs_2000.fa

    ragtag.py scaffold CP074710.fasta A19_vrap_out/vrap_contig.fasta
    Acinetobacter baumannii strain ATCC 17978 plasmid unnamed5, complete sequence   Acinetobacter baumannii     24842   50362   100%    0.0     100.00%     38118   CP074711.1
    Acinetobacter baumannii strain ATCC 17978 plasmid unnamed3, complete sequence   Acinetobacter baumannii     20953   37147   100%    0.0     100.00%     20351   CP074707.1

3.5 ragtag round3

    ragtag.py patch CP059040.fasta A6_vrap_out/vrap_contig.fasta
    ragtag.py scaffold CP059040.fasta A6_vrap_out/vrap_contig.fasta

    python3 extract_unique_sequences.py vrap_contig.fasta blast/blastn.fa A10_extracted_contigs.fa
    ragtag.py scaffold CP059040.fasta A10_vrap_out/A10_extracted_contigs.fa
    python3 filter_contigs.py A10_ragtag_output/ragtag.scaffold.fasta A10_ragtag_contigs_2000.fa
    # Check the converage to determine if they are chr or plasmids!
    #>CAP_12_length_2972
    #Bacillus paranthracis strain Bc006 chromosome, complete genome     Bacillus paranthracis   4691    5472    100%    0.0     99.73%  5304537     CP119629.1
    #Bacillus paranthracis strain Bc006 chromosome, complete genome     Bacillus paranthracis   9762    11557   97%     0.0     99.87%  5304537     CP119629.1
    #Bacillus paranthracis strain Bc006 chromosome, complete genome     Bacillus paranthracis   14883   1.353e+05   100%    0.0     99.98%  5304537     CP119629.1
    #CP119629.1 Submitted (10-MAR-2023) College of Animal Science, Ahau, Chagnjianxilu, Anhui 230000, China

    ragtag.py patch CU459141.fasta A12_vrap_out/vrap_contig.fasta
    ragtag.py scaffold CU459141.fasta A12_vrap_out/vrap_contig.fasta

    ragtag.py patch CP074710.fasta A19_vrap_out/vrap_contig.fasta
    ragtag.py scaffold CP074710.fasta A19_vrap_out/vrap_contig.fasta

    A10:
    CP059040_RagTag 3941335 17      60      61
    CAP_11_length_2972      2972    4007061 60      61
    CAP_1_length_17300      17300   4010103 60      61
    samtools faidx A10_ragtag_contigs_2000.fa CAP_1_length_17300 > A10_plasmid.fasta

3.6 To map reads from one isolate (isolate 2) to the genome of another isolate (isolate 1), check coverage, and generate a consensus FASTA from the alignment, with regions not covered by the mapping masked as 'N', you'll need to follow a series of steps using bioinformatics tools.

    # -- Mapping to CP059040 --
    5447038 + 0 in total (QC-passed reads + QC-failed reads)
    5447038 + 0 primary
    0 + 0 secondary
    0 + 0 supplementary
    0 + 0 duplicates
    0 + 0 primary duplicates
    3346945 + 0 mapped (61.45% : N/A)
    3346945 + 0 primary mapped (61.45% : N/A)
    4361512 + 0 paired in sequencing
    2180756 + 0 read1
    2180756 + 0 read2
    2579038 + 0 properly paired (59.13% : N/A)
    2681114 + 0 with itself and mate mapped
    21988 + 0 singletons (0.50% : N/A)
    0 + 0 with mate mapped to a different chr
    0 + 0 with mate mapped to a different chr (mapQ>=5)

    # -- Mapping to A6 --
    5447044 + 0 in total (QC-passed reads + QC-failed reads)
    5447044 + 0 primary
    0 + 0 secondary
    0 + 0 supplementary
    0 + 0 duplicates
    0 + 0 primary duplicates
    3437574 + 0 mapped (63.11% : N/A)
    3437574 + 0 primary mapped (63.11% : N/A)
    4361524 + 0 paired in sequencing
    2180762 + 0 read1
    2180762 + 0 read2
    2646686 + 0 properly paired (60.68% : N/A)
    2753738 + 0 with itself and mate mapped
    22055 + 0 singletons (0.51% : N/A)
    1484 + 0 with mate mapped to a different chr
    1390 + 0 with mate mapped to a different chr (mapQ>=5)

    #- Sort and Index the BAM File:
    #    samtools sort -o sorted.bam input.bam
    #    samtools index sorted.bam
    #- Call Variants:
    #    bcftools mpileup -Ou -f reference.fasta sorted.bam | bcftools call -mv -Oz -o calls.vcf.gz
    #    bcftools index calls.vcf.gz
    #- Generate Consensus FASTA:
    #    bcftools consensus -f reference.fasta calls.vcf.gz -o consensus.fasta

    Step 1: Align Reads to Reference Genome
        First, align the reads from isolate 2 to the genome of isolate 1. For example, using BWA:

            bwa mem reference_genome.fasta isolate2_reads.fastq > aligned.sam

        or, use vrap to generate input.bam

            #for bowtie (1 DB): --host Bacillus_cereus.fasta 
            #for blast (5 DBs): --virus=Acinetobacter_baumannii.fasta -n /mnt/h1/jhuang/blast/nt -a /mnt/h1/jhuang/blast/nr, and two default download_db.py
            #A6_chr_plasmid.fasta reads filtered, should only the contamination reads remaining! 
            vrap/vrap.py -1 ./results_ATCC19606/trimmed/A10_CraA_HQ_trimmed_P_1.fastq.gz -2 ./results_ATCC19606/trimmed/A10_CraA_HQ_trimmed_P_2.fastq.gz -o A10_vrap_host_A6 --host A6_chr_plasmid.fasta --noblast -t 55

    Step 2: Convert SAM to BAM, Sort and Index
        Convert the SAM file to BAM, sort it, and create an index:

            samtools view -Sb mapped > aligned.bam
            samtools sort aligned.bam -o sorted.bam
            samtools index sorted.bam

    Step 3: Coverage Analysis
        Check the coverage of the genome regions. Regions with no coverage will be masked later.

            #bedtools genomecov -ibam sorted.bam -bg > coverage.bed
            #BUG above: By default, Bedtools doesn't explicitly list regions with zero coverage in the output of genomecov. To get around this, you can use the -bga option with genomecov. This option ensures that regions with zero coverage are also included in the output. Here's how you can modify your command:
            bedtools genomecov -ibam sorted.bam -bga > coverage.bed

    Step 4: Create a Bed File for Masking
        Create a BED file that specifies regions with no or low coverage. Assume that any region with coverage less than a certain threshold (e.g., 1x) should be masked:

            awk '$4 < 1 {print $1 "\t" $2 "\t" $3}' coverage.bed > low_coverage.bed

    Step 5: Generate Consensus Sequence
        Generate a consensus sequence, using bcftools, and mask regions with low or no coverage:

            bcftools mpileup -Ou -f ../../CP059040.fasta sorted.bam | bcftools call -mv --ploidy 1 -Oz -o calls.vcf.gz
            bcftools index calls.vcf.gz
            bcftools consensus -f ../../CP059040.fasta -m low_coverage.bed calls.vcf.gz -o consensus.fasta
            #The site CP059040:4 overlaps with another variant, skipping...
            #The site CP059040:3125036 overlaps with another variant, skipping...
            #Applied 58 variants

            bcftools mpileup -Ou -f ../../A6_chr_plasmid.fasta sorted.bam | bcftools call -mv --ploidy 1 -Oz -o calls.vcf.gz
            bcftools index calls.vcf.gz
            bcftools consensus -f ../../A6_chr_plasmid.fasta -m low_coverage.bed calls.vcf.gz -o consensus.fasta
            #The site seq00000000:4 overlaps with another variant, skipping...
            #The site seq00000000:3125036 overlaps with another variant, skipping...
            #Applied 61 variants
            #--> TODO: could report the following regions are not covered in A10 comparing to A6.
            #seq00000000     364666  365765
            #seq00000000     2810907 2813323
            #seq00000000     2813552 2813779
            #seq00000000     2813923 2819663
            #seq00000000     2821839 2831921
            #seq00000000     2832750 2832753
            #seq00000000     2833977 2860354
            #seq00000000     2860505 2861780
            #seq00000000     2861931 2861951
            #seq00000000     2862102 2862234
            #seq00000000     2862385 2863434
            #seq00000000     3124939 3124943
            #seq00000000     3125009 3125010

        In this command, the -m option in bcftools consensus is used to specify a BED file for masking low-coverage regions.
  1. varicant calling using spandx

4.2-1. generate genebank in snpEff

    mkdir ~/anaconda3/envs/spandx/share/snpeff-4.3.1t-5/data/ATCC17978 
    cp ATCC17978.gb ~/anaconda3/envs/spandx/share/snpeff-4.3.1t-5/data/ATCC17978/genes.gbk
    mkdir ~/anaconda3/envs/spandx/share/snpeff-4.3.1t-5/data/ATCC19606 
    cp ATCC19606.gb ~/anaconda3/envs/spandx/share/snpeff-4.3.1t-5/data/ATCC19606/genes.gbk
    mkdir ~/anaconda3/envs/spandx/share/snpeff-4.3.1t-5/data/AYE 
    cp AYE.gb ~/anaconda3/envs/spandx/share/snpeff-4.3.1t-5/data/AYE/genes.gbk

    vim ~/anaconda3/envs/spandx/share/snpeff-4.3.1t-5/snpEff.config
    /home/jhuang/anaconda3/envs/spandx/bin/snpEff build -genbank ATCC17978      -d
    /home/jhuang/anaconda3/envs/spandx/bin/snpEff build -genbank ATCC19606      -d
    /home/jhuang/anaconda3/envs/spandx/bin/snpEff build -genbank AYE      -d

    /home/jhuang/anaconda3/envs/spandx/bin/snpEff build -genbank ATCC17978
    #        Protein check:  ATCC17978       OK: 3644        Not found: 81   Errors: 0       Error percentage: 0.0%
    /home/jhuang/anaconda3/envs/spandx/bin/snpEff build -genbank ATCC19606
    #        Protein check:  ATCC19606       OK: 3702        Not found: 0    Errors: 0       Error percentage: 0.0%
    /home/jhuang/anaconda3/envs/spandx/bin/snpEff build -genbank AYE
    #        Protein check:  AYE     OK: 3607        Not found: 52   Errors: 0       Error percentage: 0.0%

4.3. using spandx calling variants

    #Note that we recall the wild type (isolate 150) SNPs to its own assembly to increase the precision.
    gzip A19_17978_HQ_trimmed_P_1.fastq A19_17978_HQ_trimmed_P_2.fastq
    #ln -s /home/jhuang/Tools/spandx/ spandx
    #-t
    snpEff eff -nodownload -no-downstream -no-intergenic -ud 100 -v CP040849.1 noAB_wildtype_trimmed.PASS.snps.vcf > noAB_wildtype_trimmed.PASS.snps.annotated.vcf
    #
    #               'CP040849'      2659111 Standard
    (spandx) jhuang@hamm:~/DATA/Data_Tam_Acinetobacter_baumannii/results_ATCC17978$ nextflow run spandx/main.nf --fastq "trimmed/*_P_{1,2}.fastq.gz" --ref ATCC17978.fasta --annotation --database ATCC17978 -resume

    gzip A12_AYE_HQ_trimmed_P_1.fastq A12_AYE_HQ_trimmed_P_2.fastq
    ln -s /home/jhuang/Tools/spandx/ spandx
    nextflow run spandx/main.nf --fastq "trimmed/*_P_{1,2}.fastq.gz" --ref AYE.fasta --annotation --database AYE -resume

    gzip A6_WT_HQ_trimmed_P_1.fastq A6_WT_HQ_trimmed_P_2.fastq A10_CraA_HQ_trimmed_P_1.fastq A10_CraA_HQ_trimmed_P_2.fastq
    ln -s /home/jhuang/Tools/spandx/ spandx
    nextflow run spandx/main.nf --fastq "trimmed/*_P_{1,2}.fastq.gz" --ref ATCC19606.fasta --annotation  --database ATCC19606 -resume

    # DEBUG: "--mixtures --structural" does not work well!
    #pindel -f ATCC19606.fasta -T 2 -i pindel.bam.config -o pindel.out                 
    #pindel -f ATCC19606.fasta -T 2 -i pindel.bam.config -o pindel.out

6, filtering process of SPANDx results

    #Input file is Outputs/Phylogeny_and_annotation/All_SNPs_indels_annotated.txt (61203)
    #- Step1: filtering $6!=$7 (330)
    awk '{if($6!=$7) print}' < All_SNPs_indels_annotated.txt > All_SNPs_indels_annotated_.txt
    #- Step2: restricted the first 7 columns (240)
    cut -d$'\t' -f1-7 All_SNPs_indels_annotated_.txt > f1_7
    #- Step3: filtering examples as "G/C" (1450)
    grep -v "/" f1_7 > f1_7_
    #- Step4: filtering examples as ".       A" (170)
    grep -v "\." f1_7_ > f1_7__
    #- Step5: filtering examples as "T       *" (162), since we want to have only SNPs differing between BK20399 and GE3138.
    grep -v "*" f1_7__ > f1_7___
    #- Step6: only SNPs (50).
    grep -v "INDEL" f1_7___ > f1_7____
    #- Step7: get the complete rows with the positions
    cut -d$'\t' -f2-2 f1_7____ > f2
    replace "\n" with " All_SNPs_indels_annotated_.txt >> All_SNPs_annotated.txt\ngrep "
    grep "CHROM" All_SNPs_indels_annotated_.txt > All_SNPs_annotated.txt
    grep "46882" All_SNPs_indels_annotated_.txt >> All_SNPs_annotated.txt
    grep "46892" All_SNPs_indels_annotated_.txt >> All_SNPs_annotated.txt
    grep "46936" All_SNPs_indels_annotated_.txt >> All_SNPs_annotated.txt
    ....
    #- Step8: Missense mutations ()
    grep "missense_variant" All_SNPs_annotated.txt

7, indel calling using freebayes from merged bam

    mkdir freebayes
    cd freebayes
    #run picard unter java17
    (java17) picard-tools AddOrReplaceReadGroups I=../Outputs/bams/A12_AYE_HQ_trimmed.dedup.bam O=A12_AYE_readgroup-added.bam SORT_ORDER=coordinate CREATE_INDEX=true RGPL=illumina RGID=A12_AYE RGSM=A12_AYE RGLB=standard RGPU=A12_AYE VALIDATION_STRINGENCY=LENIENT
    #(java17) picard-tools AddOrReplaceReadGroups I=../Outputs/bams/148_trimmed.dedup.bam O=148_readgroup-added.bam SORT_ORDER=coordinate CREATE_INDEX=true RGPL=illumina RGID=148 RGSM=148 RGLB=standard RGPU=148 VALIDATION_STRINGENCY=LENIENT
    #(java17) picard-tools AddOrReplaceReadGroups I=../Outputs/bams/149_trimmed.dedup.bam O=149_readgroup-added.bam SORT_ORDER=coordinate CREATE_INDEX=true RGPL=illumina RGID=149 RGSM=149 RGLB=standard RGPU=149 VALIDATION_STRINGENCY=LENIENT
    #(java17) picard-tools MergeSamFiles INPUT=150_readgroup-added.bam INPUT=148_readgroup-added.bam INPUT=149_readgroup-added.bam OUTPUT=merged_picard.bam
    #(java17) picard-tools CreateSequenceDictionary R=wildtype_150.fasta O=wildtype_150.dict

    #Note that the reference is 'wildtype_150'.
    # Direkt SNP calling is not suggested. It is better if we take a realigning step before SNP and Indel-calling.
    #conda install -c bioconda gatk4 gatk
    #gatk-register /home/jhuang/Tools/GenomeAnalysisTK-3.8-1-0-gf15c1c3ef/GenomeAnalysisTK.jar
    #need wildtype_150.fasta.fai and wildtype_150.dict

    #under the bengal3_ac3 environment
    (bengal3_ac3) samtools index merged_picard.bam 
    (bengal3_ac3) gatk -T RealignerTargetCreator -I merged_picard.bam  -R /media/jhuang/Elements1/Data_Holger_Klebsiella_pneumoniae_SNP/wildtype_150.fasta -o realigned.merged.bam.list
    (bengal3_ac3) gatk -T IndelRealigner -I merged_picard.bam -R /media/jhuang/Elements1/Data_Holger_Klebsiella_pneumoniae_SNP/wildtype_150.fasta -targetIntervals realigned.merged.bam.list -o merged_realigned.bam

    freebayes --fasta-reference /media/jhuang/Elements1/Data_Holger_Klebsiella_pneumoniae_SNP_PUBLISHING/wildtype_150.fasta -b merged_realigned.bam -p 1 -i -X -u | /home/jhuang/Tools/freebayes_git/freebayes/vcflib/bin/vcffilter -f 'QUAL > 30' > merged.realigned.snps_filtered_freebayes.vcf

    freebayes --fasta-reference /media/jhuang/Elements1/Data_Holger_Klebsiella_pneumoniae_SNP_PUBLISHING/wildtype_150.fasta -b merged_realigned.bam -p 1 -I -X -u | /home/jhuang/Tools/freebayes_git/freebayes/vcflib/bin/vcffilter -f 'QUAL > 30' > merged.realigned.indels_filtered_freebayes.vcf
    #BETTER with realigned bam (see commands above): freebayes --fasta-reference /media/jhuang/Elements1/Data_Holger_Klebsiella_pneumoniae_SNP_PUBLISHING/wildtype_150.fasta -b merged_picard.bam   -p 1 -I -X -u | /home/jhuang/Tools/freebayes_git/freebayes/vcflib/bin/vcffilter -f 'QUAL > 30' > merged.indels_filtered_freebayes.vcf

    #freebayes -f {params.reference} \
    #  --ploidy 1 \
    #  --min-coverage {params.mincov} \
    #  --min-mapping-quality {params.mapqual} \
    #  --min-base-quality {params.minbasequal} \
    #  {input.forward} {input.reverse} > {output.vcf}

    #To call structural variants (indels) and single nucleotide polymorphisms (SNPs) using FreeBayes, you'll need to follow a series of steps. FreeBayes is a Bayesian genetic variant detector designed to find small polymorphisms, specifically SNPs, indels, MNPs (multi-nucleotide polymorphisms), and complex events (composite insertion and substitution events) smaller than the length of a read.

    #FreeBayes is primarily designed for calling small-scale variants like SNPs and indels (insertions and deletions) but is not typically used for large-scale structural variant detection (like large insertions, deletions, inversions, or translocations). Structural variants are usually larger in size, often exceeding the length of a single read, which is beyond the scope of what FreeBayes is optimized to handle.
    #For calling structural variants, you would generally use other tools that are specifically designed for this purpose, such as:
        Delly: For large deletions, tandem duplications, inversions, and translocations.
        LUMPY: A probabilistic framework for structural variant discovery.
        Manta: Designed for efficient calling of structural variants in cancer and germline studies.
        CNVnator: For discovering copy number variants.
    #If you're specifically looking to use FreeBayes for larger indels that might be on the edge of what is considered a small-scale variant, you can adjust the parameters to be more permissive with respect to variant size. However, this is not typically recommended for true structural variants.
    For small indels and SNPs, FreeBayes is quite effective, and you would follow the general procedure of aligning your reads, processing the BAM files, and then running FreeBayes with appropriate parameters set for your study's requirements. But again, for structural variant detection, other tools would be more suitable.

8, merge results from two variant calling methods!

    #-- post-processing --
    awk '{if($6!=$7) print}' < All_SNPs_indels_annotated.txt > All_SNPs_indels_annotated_.txt
    cut -d$'\t' -f1-7 All_SNPs_indels_annotated_.txt > f1_7
    grep -v "/" f1_7 > f1_7_
    grep -v "\." f1_7_ > f1_7__
    grep -v "*" f1_7__ > f1_7___

    grep -v "INDEL" f1_7___ > f1_7_SNPs
    cut -d$'\t' -f2-2 f1_7_SNPs > f1_7_SNPs_f2
    \n --> " All_SNPs_indels_annotated_.txt >> All_SNPs_annotated.txt\ngrep "
    #grep "CHROM" All_SNPs_indels_annotated_.txt > All_SNPs_annotated.txt
    #grep "46882" All_SNPs_indels_annotated_.txt >> All_SNPs_annotated.txt
    #grep "46892" All_SNPs_indels_annotated_.txt >> All_SNPs_annotated.txt

    grep -v "SNP" f1_7___ > f1_7_indels
    cut -d$'\t' -f2-2 f1_7_indels > f1_7_indels_f2
    \n --> " All_SNPs_indels_annotated_.txt >> All_indels_annotated.txt\ngrep "
    #grep "CHROM" All_SNPs_indels_annotated_.txt > All_indels_annotated.txt
    #grep "49848" All_SNPs_indels_annotated_.txt >> All_indels_annotated.txt
    #grep "53872" All_SNPs_indels_annotated_.txt >> All_indels_annotated.txt

9 (optional), PubMLST, ResFinder or RGI calling https://cge.cbs.dtu.dk/services/ https://cge.cbs.dtu.dk/services/MLST/ https://pubmlst.org/bigsdb?db=pubmlst_pacnes_seqdef&page=batchProfiles&scheme_id=3 https://cge.food.dtu.dk/services/ResFinder/ https://card.mcmaster.ca/home https://www.pseudomonas.com/ https://www.pseudomonas.com/amr/list https://card.mcmaster.ca/analyze/rgi The antimicrobial resistance genes were predicted by the tool RGI 6.0.0 which utilizes the database CARD 3.2.5 (PMID: 31665441). !!!!#check the difference of SNPs in the two files, and then look for the SNPs in my tables comparing the two isolates!!!!

    cut -d$'\t' -f9-21 BK20399_contigs.fa.txt > f9_21
    cut -d$'\t' -f2-6 BK20399_contigs.fa.txt > f2_6
    #Best_Hit_ARO --> Antibiotic Resistance Ontology (ARO) Term
    paste f9_21 f2_6 > BK20399_.txt
    #grep "CARD_Protein_Sequence" BK20399_.txt > header
    grep -v "CARD_Protein_Sequence" BK20399_.txt > BK20399__.txt
    sort BK20399__.txt > BK20399_sorted.txt

    cut -d$'\t' -f9-21 GE3138_contigs.fa.txt > f9_21
    cut -d$'\t' -f2-6 GE3138_contigs.fa.txt > f2_6
    paste f9_21 f2_6 > GE3138_.txt
    grep -v "CARD_Protein_Sequence" GE3138_.txt > GE3138__.txt
    sort GE3138__.txt > GE3138_sorted.txt

    cut -d$'\t' -f1-6 BK20399_sorted.txt > BK20399_f1_6.txt  
    cut -d$'\t' -f1-6 GE3138_sorted.txt > GE3138_f1_6.txt
    diff BK20399_f1_6.txt GE3138_f1_6.txt
    #-->< APH(3'')-Ib   99.63   3002639 protein homolog model   n/a n/a

    cat header BK20399_sorted.txt > BK20399.txt
    cat header GE3138_sorted.txt > GE3138.txt

    ~/Tools/csv2xls-0.4/csv_to_xls.py BK20399.txt GE3138.txt -d$'\t' -o ARG_calling.xls

    #---- another example ----
    cut -d$'\t' -f9-21 AW27149_before.txt > f9_21
    cut -d$'\t' -f2-6 AW27149_before.txt > f2_6
    #Best_Hit_ARO --> Antibiotic Resistance Ontology (ARO) Term
    paste f9_21 f2_6 > AW27149_before_.txt
    grep "CARD_Protein_Sequence" AW27149_before_.txt > header
    grep -v "CARD_Protein_Sequence" AW27149_before_.txt > AW27149_before__.txt
    sort AW27149_before__.txt > AW27149_before_sorted.txt

    cut -d$'\t' -f9-21 AW48641_after.txt > f9_21
    cut -d$'\t' -f2-6 AW48641_after.txt > f2_6
    paste f9_21 f2_6 > AW48641_after_.txt
    grep -v "CARD_Protein_Sequence" AW48641_after_.txt > AW48641_after__.txt
    sort AW48641_after__.txt > AW48641_after_sorted.txt

    cut -d$'\t' -f1-6 AW27149_before_sorted.txt > AW27149_before_f1_6.txt  
    cut -d$'\t' -f1-6 AW48641_after_sorted.txt > AW48641_after_f1_6.txt
    diff AW27149_before_f1_6.txt AW48641_after_f1_6.txt
    #48c48
    #< Pseudomonas aeruginosa CpxR   100.0   3004054 protein homolog model   n/a     n/a
    #---
    #> Pseudomonas aeruginosa CpxR   99.56   3004054 protein homolog model   n/a     n/a

    cat header AW27149_before_sorted.txt > ../RGI_calling_AW27149_before.txt
    cat header AW48641_after_sorted.txt > ../RGI_calling_AW48641_after.txt

    #mlst add header "isolate   species ST  acsA    aroE    guaA    mutL    nuoD    ppsA    trpE"
    mv 
    ~/Tools/csv2xls-0.4/csv_to_xls.py All_SNPs_annotated.csv RGI_calling_AW27149_before.txt RGI_calling_AW48641_after.txt mlst.txt -d$'\t' -o SNP_ARG_calling.xls

    #Find the common SNP between SNP calling and RGI calling, since the SNP calling has bad annotation, marked yellow.

like unlike

点赞本文的读者

还没有人对此文章表态


本文有评论

没有评论

看文章,发评论,不要沉默


© 2023 XGenes.com Impressum