Workflow using QIIME2 for Data_Marius_16S_2025

gene_x 0 like s 150 view s

Tags: pipeline

Install and test qiime2-docker

#Cannot run under QIIME1, switch to QIIME2: pick_open_reference_otus.py -r/home/jhuang/REFs/SILVA_132_QIIME_release/rep_set/rep_set_16S_only/99/silva_132_99_16S.fna -i test.fna -o clustering_test/ -p clustering_params.txt --parallel --verbose

docker pull quay.io/qiime2/core:2023.9

docker run -it --rm \
-v /mnt/md1/DATA/Data_Marius_16S_2025:/data \
-v /home/jhuang/REFs:/home/jhuang/REFs \
quay.io/qiime2/core:2023.9 bash
cd /data

Import the fastq-files to paired-end-demux.qza

#https://docs.qiime2.org/2018.8/tutorials/importing/

qiime tools import --type 'SampleData[PairedEndSequencesWithQuality]' --input-path pe-33-manifest --output-path paired-end-demux.qza --input-format PairedEndFastqManifestPhred33
#--> 1095204304 Mai 27 15:11 paired-end-demux.qza

qiime demux summarize \
--i-data paired-end-demux.qza \
--o-visualization demux_pe.qzv

#https://view.qiime2.org
#qiime tools view demux_pe.qzv

Optimizing the parameters trunc-len-f and trunc-len-r and denoising with DADA2: optimized parameters is f240_r240

#Your amplicon (V3–V4 region) is ~464 bp, so you need ≥20–30 bp overlap
#464-38=426; 440 is the longst +12 nt for overlapping=we need 452 nt!

#Optimize the parameters --p-trunc-len-f and --p-trunc-len-r
./dada2_batch_test.sh

        #!/bin/bash

        # Set your base inputs
        INPUT=paired-end-demux.qza
        TRIM_LEFT_F=17
        TRIM_LEFT_R=21

        # Output base
        OUTPUT_DIR=dada2_tests
        mkdir -p $OUTPUT_DIR

        # Loop over trunc-len-f and trunc-len-r combinations
        # Forward: from 220 to 240
        # Reverse: from 210 to 230
        i=1
        for f in 240 235 230 225; do
        for r in 225 220 215; do
            OUT=test_${i}_f${f}_r${r}
            echo "Running: $OUT"
            mkdir -p $OUTPUT_DIR/$OUT

            qiime dada2 denoise-paired \
            --i-demultiplexed-seqs $INPUT \
            --p-trim-left-f $TRIM_LEFT_F \
            --p-trim-left-r $TRIM_LEFT_R \
            --p-max-ee-f 3 --p-max-ee-r 5 \
            --p-trunc-len-f $f \
            --p-trunc-len-r $r \
            --p-n-threads 32 \
            --o-table $OUTPUT_DIR/$OUT/table.qza \
            --o-representative-sequences $OUTPUT_DIR/$OUT/rep-seqs.qza \
            --o-denoising-stats $OUTPUT_DIR/$OUT/denoising-stats.qza \
            --verbose > $OUTPUT_DIR/$OUT/log.txt 2>&1

            ((i++))
        done
        done

for f in dada2_tests2/test_*/denoising-stats.qza; do
qiime metadata tabulate \
    --m-input-file $f \
    --o-visualization ${f%.qza}.qzv
done

#pandaseq.out: grep ">" A1_R1.fastq.gz_merged.fasta | wc -l #8229;  grep ">" A10_R1.fastq.gz_merged.fasta | wc -l #9165

# The best choice is f251_r251, since the first 17 and 21 bases with bad quality are anyway removed!
#f251_r251: sample-A1   18299   10989   60.05   10609   7129    38.96   6535    35.71;  sample-A10  18736   12249   65.38   11778   7444    39.73   6978    37.24
#f251_r250: sample-A1   18299   11855   64.78   11435   7431    40.61   6823    37.29;  sample-A10  18736   13092   69.88   12590   7714    41.17   7206    38.46
#f251_r245: sample-A1   18299   12642   69.09   12180   7860    42.95   7193    39.31;  sample-A10  18736   13981   74.62   13457   8218    43.86   7649    40.83
#f251_r240: sample-A1   18299   12678   69.28   12214   8060    44.05   7387    40.37;  sample-A10  18736   14018   74.82   13498   8412    44.9    7758    41.41

#f250_r240: sample-A1   18299   13705   74.89   13191   8796    48.07   7984    43.63
#f250_r235: sample-A1   18299   13716   74.95   13198   8793    48.05   7969    43.55
#f250_r230: sample-A1   18299   13739   75.08   13217   9023    49.31   8113    44.34;  sample-A10  18736   14838   79.2    14159   8895    47.48   8101    43.24
#f245_r240: sample-A1   18299   14513   79.31   14019   9472    51.76   8739    47.76;  sample-A10  18736   15609   83.31   15102   9605    51.26   8880    47.4
#f245_r235: sample-A1   18299   14524   79.37   14026   9485    51.83   8746    47.79;  sample-A10  18736   15634   83.44   15127   9685    51.69   8869    47.34
#f245_r230: sample-A1   18299   14547   79.5    14045   8845    48.34   8058    44.04;  sample-A10  18736   15664   83.6    15156   8812    47.03   7999    42.69
#f240_r240: sample-A1   18299   14647   80.04   14164   9869    53.93   8932    48.81;  sample-A10  18736   15728   83.95   15213   10488   55.98   9081    48.47 *
#f240_r235: sample-A1   18299   14661   80.12   14172   9125    49.87   8194    44.78;  sample-A10  18736   15755   84.09   15242   9579    51.13   8105    43.26
#f240_r230: sample-A1   18299   14686   80.26   14191   4952    27.06   4666    25.5;   sample-A10  18736   15785   84.25   15267   3489    18.62   3341    17.83

#f240_r225: sample-A1   18299   14701   80.34   14206   4575    25  4297    23.48
#f240_r220: sample-A1   18299   14720   80.44   14223   4588    25.07   4310    23.55
#f240_r225: sample-A1   18299   14747   80.59   14250   3976    21.73   3758    20.54
#f230_r220: sample-A1   18299   14972   81.82   14514   3   0.02    3   0.02

#The output of the optimized denoising: (*) dada2_tests2/test_7_f240_r240/table.qza and dada2_tests2/test_7_f240_r240/rep-seqs.qza.

Visualize outputs

qiime feature-table summarize \
--i-table dada2_tests2/test_7_f240_r240/table.qza \
--o-visualization table.qzv \
--m-sample-metadata-file qiime2_metadata.tsv

#Table summary
#Metric Sample
#Number of samples  137
#Number of features 3,039
#Total frequency    1,641,484
#
#Frequency per sample
#Minimum frequency  413.0
#1st quartile   10,319.0
#Median frequency   11,530.0
#3rd quartile   13,146.0
#Maximum frequency  40,022.0
#Mean frequency 11,981.635036496351
#
#Frequency per feature
#Minimum frequency  1.0
#1st quartile   3.0
#Median frequency   8.0
#3rd quartile   95.5
#Maximum frequency  56,472.0
#Mean frequency 540.1395195788089

#qiime tools peek table.qza
#qiime tools peek qiime2_metadata.tsv

qiime feature-table tabulate-seqs \
--i-data dada2_tests2/test_7_f240_r240/rep-seqs.qza \
--o-visualization rep-seqs.qzv

qiime metadata tabulate \
--m-input-file dada2_tests2/test_7_f240_r240/denoising-stats.qza \
--o-visualization denoising-stats.qzv

Import reference sequences and taxonomy (SILVA 132)

qiime tools import \
--type 'FeatureData[Sequence]' \
--input-path /home/jhuang/REFs/SILVA_132_QIIME_release/rep_set/rep_set_16S_only/99/silva_132_99_16S.fna \
--output-path silva_132_99_otus.qza \
--input-format DNAFASTAFormat

qiime tools import \
--type 'FeatureData[Taxonomy]' \
--input-format HeaderlessTSVTaxonomyFormat \
--input-path /home/jhuang/REFs/SILVA_132_QIIME_release/taxonomy/16S_only/99/consensus_taxonomy_7_levels.txt \
--output-path silva_132_99_taxonomy.qza

Assign taxonomy

qiime feature-classifier classify-consensus-vsearch \
--i-query dada2_tests2/test_7_f240_r240/rep-seqs.qza \
--i-reference-reads silva_132_99_otus.qza \
--i-reference-taxonomy silva_132_99_taxonomy.qza \
--p-perc-identity 0.97 \
--p-threads 64 \
--o-classification taxonomy.qza \
--o-search-results search-results.qza

Visualize taxonomy

qiime taxa barplot \
--i-table dada2_tests2/test_7_f240_r240/table.qza \
--i-taxonomy taxonomy.qza \
--m-metadata-file qiime2_metadata.tsv \
--o-visualization taxa-bar-plots.qzv

Build phylogenetic tree

qiime alignment mafft \
--i-sequences dada2_tests2/test_7_f240_r240/rep-seqs.qza \
--o-alignment aligned-rep-seqs.qza

qiime alignment mask \
--i-alignment aligned-rep-seqs.qza \
--o-masked-alignment masked-aligned-rep-seqs.qza

qiime phylogeny fasttree \
--i-alignment masked-aligned-rep-seqs.qza \
--o-tree unrooted-tree.qza

(*) qiime phylogeny midpoint-root \
--i-tree unrooted-tree.qza \
--o-rooted-tree rooted-tree.qza

Core diversity analysis

#The -e 6389 flag sets the even sampling depth (rarefaction depth) to 6,389 reads for diversity analyses.
#All samples will be rarefied to 4,753 reads.
#Samples with fewer reads are excluded.
qiime diversity core-metrics-phylogenetic \
--i-phylogeny rooted-tree.qza \
--i-table dada2_tests2/test_7_f240_r240/table.qza \
--p-sampling-depth 6389 \
--m-metadata-file qiime2_metadata.tsv \
--output-dir core_metrics_results

qiime diversity alpha \
--i-table table.qza \
--p-metric chao1 \
--o-alpha-diversity core_metrics_results/chao1_vector.qza

qiime tools export --input-path core_metrics_results/shannon_vector.qza --output-path exported_alpha/shannon
qiime tools export --input-path core_metrics_results/faith_pd_vector.qza --output-path exported_alpha/faith_pd
qiime tools export --input-path core_metrics_results/observed_features_vector.qza --output-path exported_alpha/observed_features
qiime tools export --input-path core_metrics_results/chao1_vector.qza --output-path exported_alpha/chao1

qiime tools export \
--input-path core_metrics_results/unweighted_unifrac_distance_matrix.qza \
--output-path exported_unweighted_unifrac
qiime tools export \
--input-path core_metrics_results/weighted_unifrac_distance_matrix.qza \
--output-path exported_weighted_unifrac

qiime diversity beta-group-significance \
--i-distance-matrix core_metrics_results/weighted_unifrac_distance_matrix.qza \
--m-metadata-file qiime2_metadata.tsv \
--m-metadata-column Group \
--p-pairwise \
--p-method permanova \
--o-visualization beta_group_significance.qzv

qiime tools export \
--input-path beta_group_significance.qzv \
--output-path exported_beta_group

#✅ Group 1 / Group 2 — The pairwise comparisons.
#✅ Sample size — Number of samples used in the test.
#✅ Permutations — Number of permutations in the PERMANOVA test.
#✅ pseudo-F — The test statistic from PERMANOVA.
#✅ p-value — The unadjusted p-value.
#✅ q-value — The adjusted p-value (Bonferroni in QIIME2).
#The q-value column (also sometimes called p-adj) is the multiple-testing corrected p-value.
#👉 q < 0.05 means the difference is statistically significant between those two groups, even after correction.
#“There is a significant difference in community composition between Group 1 and Group 2 (p=0.002).”

#The --p-sampling-depth 6389 parameter is directly equivalent to QIIME 1’s -e 6389!
#The QIIME 2 command will compute:
#✅ Alpha diversity metrics (Observed OTUs, Shannon, Faith PD, Evenness)
#✅ Beta diversity distance matrices (UniFrac, Bray-Curtis)
#✅ PCoA plots
#✅ Excludes samples with fewer than 6,389 reads.

#📦 Output Folders and Files
#Output Description
#table.qzv  Visual of feature table (sample counts)
#rep-seqs.qzv   Sequences per ASV
#denoising-stats.qzv    DADA2 read tracking
#taxonomy.qza/.qzv  Taxonomic classification of ASVs
#taxa-bar-plots.qzv Interactive bar plots
#core_metrics_results/  Alpha/beta diversity metrics + PCoA plots

Prepare three files feeding to Phyloseq.Rmd: table.qza (see above with ), rooted-tree.qza (see above with ), qiime2_metadata_for_qza_to_phyloseq.tsv edited from qiime2_metadata.tsv.

# Rarefying can be performed here, or in Phyloseq.Rmd (default), therefore, we don't need this step any more.
qiime feature-table summarize \
--i-table core_metrics_results/rarefied_table.qza \
--o-visualization rarefied_table.qzv \
--m-sample-metadata-file qiime2_metadata.tsv

#Table summary
#Metric Sample
#Number of samples  136
#Number of features 2,781
#Total frequency    868,904

# In QIIME2, we need table.qza, not biom-file, therefore, we don't need this step any more.
qiime tools export \
--input-path core_metrics_results/rarefied_table.qza \
--output-path exported_rarefied_table
#-->
exported_rarefied_table/feature-table.biom

biom convert \
-i exported_rarefied_table/feature-table.biom \
-o exported_rarefied_table/feature-table.tsv \
--to-tsv

#✅ Old QIIME 1 table with GenBank IDs (like EF603722.1.1487) as feature labels.
#✅ QIIME 2 table where feature IDs are hashes (like 0b438323a296b5f2ce2c8bbe3949ee8d).

# Visulaize the taxonomy.qza
qiime tools export \
--input-path taxonomy.qza \
--output-path exported-taxonomy

#Feature ID    Taxon                               Confidence
#0b4383...     k__Bacteria; p__Proteobacteria...   0.98
#dfa833...     k__Bacteria; p__Firmicutes...       0.87
#...

# ---- I used the following to generate two file for feeding in the Phyloseq.Rmd ----

#-1- exported_table/feature-table.biom corresesponds to table_even6389.biom in QIIME1, but in QIIME2, we don't need biom-file, instead of table.qza.

qiime tools export \
--input-path dada2_tests2/test_7_f240_r240/table.qza \
--output-path exported_table
#--> exported_table/feature-table.biom

#-2- exported-tree/tree.nwk corresesponds to rep_set.tre in QIIME1

qiime tools export \
--input-path rooted-tree.qza \
--output-path exported-tree
#--> exported-tree/tree.nwk

# ---- The code in Phyloseq.Rmd ----

#install.packages("remotes")
#remotes::install_github("jbisanz/qiime2R")
#"core_metrics_results/rarefied_table.qza", rarefying performed in the code, therefore import the raw table.
library(qiime2R)
ps.ng.tax <- qza_to_phyloseq(
    features =  "dada2_tests2/test_7_f240_r240/table.qza",
    tree = "rooted-tree.qza",
    metadata = "qiime2_metadata_for_qza_to_phyloseq.tsv"
)
# or
#biom convert \
#      -i ./exported_table/feature-table.biom \
#      -o ./exported_table/feature-table-v1.biom \
#      --to-json
#ps.ng.tax <- import_biom("./exported_table/feature-table-v1.biom", treefilename="./exported-tree/tree.nwk")

#Note that the alpha- and beta-diversity-files needed in Phyloseq.Rmd has been prepared in the step 9.

Figures generated by Phyloseq.Rmd and MicrobiotaProcess_*.R

The following files can be found under server.

./Phyloseq.Rmd (Result Phyloseq.html)
./MicrobiotaProcess_cluster1_Group9-11_vs_cluster2_Group12-14_orig.R
./MicrobiotaProcess_Group1_vs_Group2.R
./MicrobiotaProcess_Group3_vs_Group4.R
./MicrobiotaProcess_PCA_Group1-4.R
./MicrobiotaProcess_PCA_Group9-14.R

like unlike

点赞本文的读者

还没有人对此文章表态

本文有评论

没有评论

Workflow using QIIME2 for Data_Marius_16S_2025

本文有评论

看文章，发评论，不要沉默

最受欢迎文章

最新文章

最多评论文章

推荐相似文章