Workflow using QIIME2 for Data_Marius_16S_2025

gene_x 0 like s 8 view s

Tags: pipeline

  1. Install and test qiime2-docker

    #Cannot run under QIIME1, switch to QIIME2: pick_open_reference_otus.py -r/home/jhuang/REFs/SILVA_132_QIIME_release/rep_set/rep_set_16S_only/99/silva_132_99_16S.fna -i test.fna -o clustering_test/ -p clustering_params.txt --parallel --verbose
    
    docker pull quay.io/qiime2/core:2023.9
    
    docker run -it --rm \
    -v /mnt/md1/DATA/Data_Marius_16S_2025:/data \
    -v /home/jhuang/REFs:/home/jhuang/REFs \
    quay.io/qiime2/core:2023.9 bash
    cd /data
    
  2. Import the fastq-files to paired-end-demux.qza

    #https://docs.qiime2.org/2018.8/tutorials/importing/
    
    qiime tools import --type 'SampleData[PairedEndSequencesWithQuality]' --input-path pe-33-manifest --output-path paired-end-demux.qza --input-format PairedEndFastqManifestPhred33
    #--> 1095204304 Mai 27 15:11 paired-end-demux.qza
    
    qiime demux summarize \
    --i-data paired-end-demux.qza \
    --o-visualization demux_pe.qzv
    
    #https://view.qiime2.org
    #qiime tools view demux_pe.qzv
    
  3. Optimizing the parameters trunc-len-f and trunc-len-r and denoising with DADA2: optimized parameters is f240_r240

    #Your amplicon (V3–V4 region) is ~464 bp, so you need ≥20–30 bp overlap
    #464-38=426; 440 is the longst +12 nt for overlapping=we need 452 nt!
    
    #Optimize the parameters --p-trunc-len-f and --p-trunc-len-r
    ./dada2_batch_test.sh
    
            #!/bin/bash
    
            # Set your base inputs
            INPUT=paired-end-demux.qza
            TRIM_LEFT_F=17
            TRIM_LEFT_R=21
    
            # Output base
            OUTPUT_DIR=dada2_tests
            mkdir -p $OUTPUT_DIR
    
            # Loop over trunc-len-f and trunc-len-r combinations
            # Forward: from 220 to 240
            # Reverse: from 210 to 230
            i=1
            for f in 240 235 230 225; do
            for r in 225 220 215; do
                OUT=test_${i}_f${f}_r${r}
                echo "Running: $OUT"
                mkdir -p $OUTPUT_DIR/$OUT
    
                qiime dada2 denoise-paired \
                --i-demultiplexed-seqs $INPUT \
                --p-trim-left-f $TRIM_LEFT_F \
                --p-trim-left-r $TRIM_LEFT_R \
                --p-max-ee-f 3 --p-max-ee-r 5 \
                --p-trunc-len-f $f \
                --p-trunc-len-r $r \
                --p-n-threads 32 \
                --o-table $OUTPUT_DIR/$OUT/table.qza \
                --o-representative-sequences $OUTPUT_DIR/$OUT/rep-seqs.qza \
                --o-denoising-stats $OUTPUT_DIR/$OUT/denoising-stats.qza \
                --verbose > $OUTPUT_DIR/$OUT/log.txt 2>&1
    
                ((i++))
            done
            done
    
    for f in dada2_tests2/test_*/denoising-stats.qza; do
    qiime metadata tabulate \
        --m-input-file $f \
        --o-visualization ${f%.qza}.qzv
    done
    
    #pandaseq.out: grep ">" A1_R1.fastq.gz_merged.fasta | wc -l #8229;  grep ">" A10_R1.fastq.gz_merged.fasta | wc -l #9165
    
    # The best choice is f251_r251, since the first 17 and 21 bases with bad quality are anyway removed!
    #f251_r251: sample-A1   18299   10989   60.05   10609   7129    38.96   6535    35.71;  sample-A10  18736   12249   65.38   11778   7444    39.73   6978    37.24
    #f251_r250: sample-A1   18299   11855   64.78   11435   7431    40.61   6823    37.29;  sample-A10  18736   13092   69.88   12590   7714    41.17   7206    38.46
    #f251_r245: sample-A1   18299   12642   69.09   12180   7860    42.95   7193    39.31;  sample-A10  18736   13981   74.62   13457   8218    43.86   7649    40.83
    #f251_r240: sample-A1   18299   12678   69.28   12214   8060    44.05   7387    40.37;  sample-A10  18736   14018   74.82   13498   8412    44.9    7758    41.41
    
    #f250_r240: sample-A1   18299   13705   74.89   13191   8796    48.07   7984    43.63
    #f250_r235: sample-A1   18299   13716   74.95   13198   8793    48.05   7969    43.55
    #f250_r230: sample-A1   18299   13739   75.08   13217   9023    49.31   8113    44.34;  sample-A10  18736   14838   79.2    14159   8895    47.48   8101    43.24
    #f245_r240: sample-A1   18299   14513   79.31   14019   9472    51.76   8739    47.76;  sample-A10  18736   15609   83.31   15102   9605    51.26   8880    47.4
    #f245_r235: sample-A1   18299   14524   79.37   14026   9485    51.83   8746    47.79;  sample-A10  18736   15634   83.44   15127   9685    51.69   8869    47.34
    #f245_r230: sample-A1   18299   14547   79.5    14045   8845    48.34   8058    44.04;  sample-A10  18736   15664   83.6    15156   8812    47.03   7999    42.69
    #f240_r240: sample-A1   18299   14647   80.04   14164   9869    53.93   8932    48.81;  sample-A10  18736   15728   83.95   15213   10488   55.98   9081    48.47 *
    #f240_r235: sample-A1   18299   14661   80.12   14172   9125    49.87   8194    44.78;  sample-A10  18736   15755   84.09   15242   9579    51.13   8105    43.26
    #f240_r230: sample-A1   18299   14686   80.26   14191   4952    27.06   4666    25.5;   sample-A10  18736   15785   84.25   15267   3489    18.62   3341    17.83
    
    #f240_r225: sample-A1   18299   14701   80.34   14206   4575    25  4297    23.48
    #f240_r220: sample-A1   18299   14720   80.44   14223   4588    25.07   4310    23.55
    #f240_r225: sample-A1   18299   14747   80.59   14250   3976    21.73   3758    20.54
    #f230_r220: sample-A1   18299   14972   81.82   14514   3   0.02    3   0.02
    
    #The output of the optimized denoising: (*) dada2_tests2/test_7_f240_r240/table.qza and dada2_tests2/test_7_f240_r240/rep-seqs.qza.
    
  4. Visualize outputs

    qiime feature-table summarize \
    --i-table dada2_tests2/test_7_f240_r240/table.qza \
    --o-visualization table.qzv \
    --m-sample-metadata-file qiime2_metadata.tsv
    
    #Table summary
    #Metric Sample
    #Number of samples  137
    #Number of features 3,039
    #Total frequency    1,641,484
    #
    #Frequency per sample
    #Minimum frequency  413.0
    #1st quartile   10,319.0
    #Median frequency   11,530.0
    #3rd quartile   13,146.0
    #Maximum frequency  40,022.0
    #Mean frequency 11,981.635036496351
    #
    #Frequency per feature
    #Minimum frequency  1.0
    #1st quartile   3.0
    #Median frequency   8.0
    #3rd quartile   95.5
    #Maximum frequency  56,472.0
    #Mean frequency 540.1395195788089
    
    #qiime tools peek table.qza
    #qiime tools peek qiime2_metadata.tsv
    
    qiime feature-table tabulate-seqs \
    --i-data dada2_tests2/test_7_f240_r240/rep-seqs.qza \
    --o-visualization rep-seqs.qzv
    
    qiime metadata tabulate \
    --m-input-file dada2_tests2/test_7_f240_r240/denoising-stats.qza \
    --o-visualization denoising-stats.qzv
    
  5. Import reference sequences and taxonomy (SILVA 132)

    qiime tools import \
    --type 'FeatureData[Sequence]' \
    --input-path /home/jhuang/REFs/SILVA_132_QIIME_release/rep_set/rep_set_16S_only/99/silva_132_99_16S.fna \
    --output-path silva_132_99_otus.qza \
    --input-format DNAFASTAFormat
    
    qiime tools import \
    --type 'FeatureData[Taxonomy]' \
    --input-format HeaderlessTSVTaxonomyFormat \
    --input-path /home/jhuang/REFs/SILVA_132_QIIME_release/taxonomy/16S_only/99/consensus_taxonomy_7_levels.txt \
    --output-path silva_132_99_taxonomy.qza
    
  6. Assign taxonomy

    qiime feature-classifier classify-consensus-vsearch \
    --i-query dada2_tests2/test_7_f240_r240/rep-seqs.qza \
    --i-reference-reads silva_132_99_otus.qza \
    --i-reference-taxonomy silva_132_99_taxonomy.qza \
    --p-perc-identity 0.97 \
    --p-threads 64 \
    --o-classification taxonomy.qza \
    --o-search-results search-results.qza
    
  7. Visualize taxonomy

    qiime taxa barplot \
    --i-table dada2_tests2/test_7_f240_r240/table.qza \
    --i-taxonomy taxonomy.qza \
    --m-metadata-file qiime2_metadata.tsv \
    --o-visualization taxa-bar-plots.qzv
    
  8. Build phylogenetic tree

    qiime alignment mafft \
    --i-sequences dada2_tests2/test_7_f240_r240/rep-seqs.qza \
    --o-alignment aligned-rep-seqs.qza
    
    qiime alignment mask \
    --i-alignment aligned-rep-seqs.qza \
    --o-masked-alignment masked-aligned-rep-seqs.qza
    
    qiime phylogeny fasttree \
    --i-alignment masked-aligned-rep-seqs.qza \
    --o-tree unrooted-tree.qza
    
    (*) qiime phylogeny midpoint-root \
    --i-tree unrooted-tree.qza \
    --o-rooted-tree rooted-tree.qza
    
  9. Core diversity analysis

    #The -e 6389 flag sets the even sampling depth (rarefaction depth) to 6,389 reads for diversity analyses.
    #All samples will be rarefied to 4,753 reads.
    #Samples with fewer reads are excluded.
    qiime diversity core-metrics-phylogenetic \
    --i-phylogeny rooted-tree.qza \
    --i-table dada2_tests2/test_7_f240_r240/table.qza \
    --p-sampling-depth 6389 \
    --m-metadata-file qiime2_metadata.tsv \
    --output-dir core_metrics_results
    
    qiime diversity alpha \
    --i-table table.qza \
    --p-metric chao1 \
    --o-alpha-diversity core_metrics_results/chao1_vector.qza
    
    qiime tools export --input-path core_metrics_results/shannon_vector.qza --output-path exported_alpha/shannon
    qiime tools export --input-path core_metrics_results/faith_pd_vector.qza --output-path exported_alpha/faith_pd
    qiime tools export --input-path core_metrics_results/observed_features_vector.qza --output-path exported_alpha/observed_features
    qiime tools export --input-path core_metrics_results/chao1_vector.qza --output-path exported_alpha/chao1
    
    qiime tools export \
    --input-path core_metrics_results/unweighted_unifrac_distance_matrix.qza \
    --output-path exported_unweighted_unifrac
    qiime tools export \
    --input-path core_metrics_results/weighted_unifrac_distance_matrix.qza \
    --output-path exported_weighted_unifrac
    
    qiime diversity beta-group-significance \
    --i-distance-matrix core_metrics_results/weighted_unifrac_distance_matrix.qza \
    --m-metadata-file qiime2_metadata.tsv \
    --m-metadata-column Group \
    --p-pairwise \
    --p-method permanova \
    --o-visualization beta_group_significance.qzv
    
    qiime tools export \
    --input-path beta_group_significance.qzv \
    --output-path exported_beta_group
    
    #✅ Group 1 / Group 2 — The pairwise comparisons.
    #✅ Sample size — Number of samples used in the test.
    #✅ Permutations — Number of permutations in the PERMANOVA test.
    #✅ pseudo-F — The test statistic from PERMANOVA.
    #✅ p-value — The unadjusted p-value.
    #✅ q-value — The adjusted p-value (Bonferroni in QIIME2).
    #The q-value column (also sometimes called p-adj) is the multiple-testing corrected p-value.
    #👉 q < 0.05 means the difference is statistically significant between those two groups, even after correction.
    #“There is a significant difference in community composition between Group 1 and Group 2 (p=0.002).”
    
    #The --p-sampling-depth 6389 parameter is directly equivalent to QIIME 1’s -e 6389!
    #The QIIME 2 command will compute:
    #✅ Alpha diversity metrics (Observed OTUs, Shannon, Faith PD, Evenness)
    #✅ Beta diversity distance matrices (UniFrac, Bray-Curtis)
    #✅ PCoA plots
    #✅ Excludes samples with fewer than 6,389 reads.
    
    #📦 Output Folders and Files
    #Output Description
    #table.qzv  Visual of feature table (sample counts)
    #rep-seqs.qzv   Sequences per ASV
    #denoising-stats.qzv    DADA2 read tracking
    #taxonomy.qza/.qzv  Taxonomic classification of ASVs
    #taxa-bar-plots.qzv Interactive bar plots
    #core_metrics_results/  Alpha/beta diversity metrics + PCoA plots
    
  10. Prepare three files feeding to Phyloseq.Rmd: table.qza (see above with ), rooted-tree.qza (see above with ), qiime2_metadata_for_qza_to_phyloseq.tsv edited from qiime2_metadata.tsv.

    # Rarefying can be performed here, or in Phyloseq.Rmd (default), therefore, we don't need this step any more.
    qiime feature-table summarize \
    --i-table core_metrics_results/rarefied_table.qza \
    --o-visualization rarefied_table.qzv \
    --m-sample-metadata-file qiime2_metadata.tsv
    
    #Table summary
    #Metric Sample
    #Number of samples  136
    #Number of features 2,781
    #Total frequency    868,904
    
    # In QIIME2, we need table.qza, not biom-file, therefore, we don't need this step any more.
    qiime tools export \
    --input-path core_metrics_results/rarefied_table.qza \
    --output-path exported_rarefied_table
    #-->
    exported_rarefied_table/feature-table.biom
    
    biom convert \
    -i exported_rarefied_table/feature-table.biom \
    -o exported_rarefied_table/feature-table.tsv \
    --to-tsv
    
    #✅ Old QIIME 1 table with GenBank IDs (like EF603722.1.1487) as feature labels.
    #✅ QIIME 2 table where feature IDs are hashes (like 0b438323a296b5f2ce2c8bbe3949ee8d).
    
    # Visulaize the taxonomy.qza
    qiime tools export \
    --input-path taxonomy.qza \
    --output-path exported-taxonomy
    
    #Feature ID    Taxon                               Confidence
    #0b4383...     k__Bacteria; p__Proteobacteria...   0.98
    #dfa833...     k__Bacteria; p__Firmicutes...       0.87
    #...
    
    # ---- I used the following to generate two file for feeding in the Phyloseq.Rmd ----
    
    #-1- exported_table/feature-table.biom corresesponds to table_even6389.biom in QIIME1, but in QIIME2, we don't need biom-file, instead of table.qza.
    
    qiime tools export \
    --input-path dada2_tests2/test_7_f240_r240/table.qza \
    --output-path exported_table
    #--> exported_table/feature-table.biom
    
    #-2- exported-tree/tree.nwk corresesponds to rep_set.tre in QIIME1
    
    qiime tools export \
    --input-path rooted-tree.qza \
    --output-path exported-tree
    #--> exported-tree/tree.nwk
    
    # ---- The code in Phyloseq.Rmd ----
    
    #install.packages("remotes")
    #remotes::install_github("jbisanz/qiime2R")
    #"core_metrics_results/rarefied_table.qza", rarefying performed in the code, therefore import the raw table.
    library(qiime2R)
    ps.ng.tax <- qza_to_phyloseq(
        features =  "dada2_tests2/test_7_f240_r240/table.qza",
        tree = "rooted-tree.qza",
        metadata = "qiime2_metadata_for_qza_to_phyloseq.tsv"
    )
    # or
    #biom convert \
    #      -i ./exported_table/feature-table.biom \
    #      -o ./exported_table/feature-table-v1.biom \
    #      --to-json
    #ps.ng.tax <- import_biom("./exported_table/feature-table-v1.biom", treefilename="./exported-tree/tree.nwk")
    
    #Note that the alpha- and beta-diversity-files needed in Phyloseq.Rmd has been prepared in the step 9.
    
  11. Figures generated by Phyloseq.Rmd and MicrobiotaProcess_*.R

    The following files can be found under server.

    ./Phyloseq.Rmd (Result Phyloseq.html)
    ./MicrobiotaProcess_cluster1_Group9-11_vs_cluster2_Group12-14_orig.R
    ./MicrobiotaProcess_Group1_vs_Group2.R
    ./MicrobiotaProcess_Group3_vs_Group4.R
    ./MicrobiotaProcess_PCA_Group1-4.R
    ./MicrobiotaProcess_PCA_Group9-14.R
    

like unlike

点赞本文的读者

还没有人对此文章表态


本文有评论

没有评论

看文章,发评论,不要沉默


© 2023 XGenes.com Impressum