gene_x 0 like s 8 view s
Tags: pipeline
Install and test qiime2-docker
#Cannot run under QIIME1, switch to QIIME2: pick_open_reference_otus.py -r/home/jhuang/REFs/SILVA_132_QIIME_release/rep_set/rep_set_16S_only/99/silva_132_99_16S.fna -i test.fna -o clustering_test/ -p clustering_params.txt --parallel --verbose
docker pull quay.io/qiime2/core:2023.9
docker run -it --rm \
-v /mnt/md1/DATA/Data_Marius_16S_2025:/data \
-v /home/jhuang/REFs:/home/jhuang/REFs \
quay.io/qiime2/core:2023.9 bash
cd /data
Import the fastq-files to paired-end-demux.qza
#https://docs.qiime2.org/2018.8/tutorials/importing/
qiime tools import --type 'SampleData[PairedEndSequencesWithQuality]' --input-path pe-33-manifest --output-path paired-end-demux.qza --input-format PairedEndFastqManifestPhred33
#--> 1095204304 Mai 27 15:11 paired-end-demux.qza
qiime demux summarize \
--i-data paired-end-demux.qza \
--o-visualization demux_pe.qzv
#https://view.qiime2.org
#qiime tools view demux_pe.qzv
Optimizing the parameters trunc-len-f and trunc-len-r and denoising with DADA2: optimized parameters is f240_r240
#Your amplicon (V3–V4 region) is ~464 bp, so you need ≥20–30 bp overlap
#464-38=426; 440 is the longst +12 nt for overlapping=we need 452 nt!
#Optimize the parameters --p-trunc-len-f and --p-trunc-len-r
./dada2_batch_test.sh
#!/bin/bash
# Set your base inputs
INPUT=paired-end-demux.qza
TRIM_LEFT_F=17
TRIM_LEFT_R=21
# Output base
OUTPUT_DIR=dada2_tests
mkdir -p $OUTPUT_DIR
# Loop over trunc-len-f and trunc-len-r combinations
# Forward: from 220 to 240
# Reverse: from 210 to 230
i=1
for f in 240 235 230 225; do
for r in 225 220 215; do
OUT=test_${i}_f${f}_r${r}
echo "Running: $OUT"
mkdir -p $OUTPUT_DIR/$OUT
qiime dada2 denoise-paired \
--i-demultiplexed-seqs $INPUT \
--p-trim-left-f $TRIM_LEFT_F \
--p-trim-left-r $TRIM_LEFT_R \
--p-max-ee-f 3 --p-max-ee-r 5 \
--p-trunc-len-f $f \
--p-trunc-len-r $r \
--p-n-threads 32 \
--o-table $OUTPUT_DIR/$OUT/table.qza \
--o-representative-sequences $OUTPUT_DIR/$OUT/rep-seqs.qza \
--o-denoising-stats $OUTPUT_DIR/$OUT/denoising-stats.qza \
--verbose > $OUTPUT_DIR/$OUT/log.txt 2>&1
((i++))
done
done
for f in dada2_tests2/test_*/denoising-stats.qza; do
qiime metadata tabulate \
--m-input-file $f \
--o-visualization ${f%.qza}.qzv
done
#pandaseq.out: grep ">" A1_R1.fastq.gz_merged.fasta | wc -l #8229; grep ">" A10_R1.fastq.gz_merged.fasta | wc -l #9165
# The best choice is f251_r251, since the first 17 and 21 bases with bad quality are anyway removed!
#f251_r251: sample-A1 18299 10989 60.05 10609 7129 38.96 6535 35.71; sample-A10 18736 12249 65.38 11778 7444 39.73 6978 37.24
#f251_r250: sample-A1 18299 11855 64.78 11435 7431 40.61 6823 37.29; sample-A10 18736 13092 69.88 12590 7714 41.17 7206 38.46
#f251_r245: sample-A1 18299 12642 69.09 12180 7860 42.95 7193 39.31; sample-A10 18736 13981 74.62 13457 8218 43.86 7649 40.83
#f251_r240: sample-A1 18299 12678 69.28 12214 8060 44.05 7387 40.37; sample-A10 18736 14018 74.82 13498 8412 44.9 7758 41.41
#f250_r240: sample-A1 18299 13705 74.89 13191 8796 48.07 7984 43.63
#f250_r235: sample-A1 18299 13716 74.95 13198 8793 48.05 7969 43.55
#f250_r230: sample-A1 18299 13739 75.08 13217 9023 49.31 8113 44.34; sample-A10 18736 14838 79.2 14159 8895 47.48 8101 43.24
#f245_r240: sample-A1 18299 14513 79.31 14019 9472 51.76 8739 47.76; sample-A10 18736 15609 83.31 15102 9605 51.26 8880 47.4
#f245_r235: sample-A1 18299 14524 79.37 14026 9485 51.83 8746 47.79; sample-A10 18736 15634 83.44 15127 9685 51.69 8869 47.34
#f245_r230: sample-A1 18299 14547 79.5 14045 8845 48.34 8058 44.04; sample-A10 18736 15664 83.6 15156 8812 47.03 7999 42.69
#f240_r240: sample-A1 18299 14647 80.04 14164 9869 53.93 8932 48.81; sample-A10 18736 15728 83.95 15213 10488 55.98 9081 48.47 *
#f240_r235: sample-A1 18299 14661 80.12 14172 9125 49.87 8194 44.78; sample-A10 18736 15755 84.09 15242 9579 51.13 8105 43.26
#f240_r230: sample-A1 18299 14686 80.26 14191 4952 27.06 4666 25.5; sample-A10 18736 15785 84.25 15267 3489 18.62 3341 17.83
#f240_r225: sample-A1 18299 14701 80.34 14206 4575 25 4297 23.48
#f240_r220: sample-A1 18299 14720 80.44 14223 4588 25.07 4310 23.55
#f240_r225: sample-A1 18299 14747 80.59 14250 3976 21.73 3758 20.54
#f230_r220: sample-A1 18299 14972 81.82 14514 3 0.02 3 0.02
#The output of the optimized denoising: (*) dada2_tests2/test_7_f240_r240/table.qza and dada2_tests2/test_7_f240_r240/rep-seqs.qza.
Visualize outputs
qiime feature-table summarize \
--i-table dada2_tests2/test_7_f240_r240/table.qza \
--o-visualization table.qzv \
--m-sample-metadata-file qiime2_metadata.tsv
#Table summary
#Metric Sample
#Number of samples 137
#Number of features 3,039
#Total frequency 1,641,484
#
#Frequency per sample
#Minimum frequency 413.0
#1st quartile 10,319.0
#Median frequency 11,530.0
#3rd quartile 13,146.0
#Maximum frequency 40,022.0
#Mean frequency 11,981.635036496351
#
#Frequency per feature
#Minimum frequency 1.0
#1st quartile 3.0
#Median frequency 8.0
#3rd quartile 95.5
#Maximum frequency 56,472.0
#Mean frequency 540.1395195788089
#qiime tools peek table.qza
#qiime tools peek qiime2_metadata.tsv
qiime feature-table tabulate-seqs \
--i-data dada2_tests2/test_7_f240_r240/rep-seqs.qza \
--o-visualization rep-seqs.qzv
qiime metadata tabulate \
--m-input-file dada2_tests2/test_7_f240_r240/denoising-stats.qza \
--o-visualization denoising-stats.qzv
Import reference sequences and taxonomy (SILVA 132)
qiime tools import \
--type 'FeatureData[Sequence]' \
--input-path /home/jhuang/REFs/SILVA_132_QIIME_release/rep_set/rep_set_16S_only/99/silva_132_99_16S.fna \
--output-path silva_132_99_otus.qza \
--input-format DNAFASTAFormat
qiime tools import \
--type 'FeatureData[Taxonomy]' \
--input-format HeaderlessTSVTaxonomyFormat \
--input-path /home/jhuang/REFs/SILVA_132_QIIME_release/taxonomy/16S_only/99/consensus_taxonomy_7_levels.txt \
--output-path silva_132_99_taxonomy.qza
Assign taxonomy
qiime feature-classifier classify-consensus-vsearch \
--i-query dada2_tests2/test_7_f240_r240/rep-seqs.qza \
--i-reference-reads silva_132_99_otus.qza \
--i-reference-taxonomy silva_132_99_taxonomy.qza \
--p-perc-identity 0.97 \
--p-threads 64 \
--o-classification taxonomy.qza \
--o-search-results search-results.qza
Visualize taxonomy
qiime taxa barplot \
--i-table dada2_tests2/test_7_f240_r240/table.qza \
--i-taxonomy taxonomy.qza \
--m-metadata-file qiime2_metadata.tsv \
--o-visualization taxa-bar-plots.qzv
Build phylogenetic tree
qiime alignment mafft \
--i-sequences dada2_tests2/test_7_f240_r240/rep-seqs.qza \
--o-alignment aligned-rep-seqs.qza
qiime alignment mask \
--i-alignment aligned-rep-seqs.qza \
--o-masked-alignment masked-aligned-rep-seqs.qza
qiime phylogeny fasttree \
--i-alignment masked-aligned-rep-seqs.qza \
--o-tree unrooted-tree.qza
(*) qiime phylogeny midpoint-root \
--i-tree unrooted-tree.qza \
--o-rooted-tree rooted-tree.qza
Core diversity analysis
#The -e 6389 flag sets the even sampling depth (rarefaction depth) to 6,389 reads for diversity analyses.
#All samples will be rarefied to 4,753 reads.
#Samples with fewer reads are excluded.
qiime diversity core-metrics-phylogenetic \
--i-phylogeny rooted-tree.qza \
--i-table dada2_tests2/test_7_f240_r240/table.qza \
--p-sampling-depth 6389 \
--m-metadata-file qiime2_metadata.tsv \
--output-dir core_metrics_results
qiime diversity alpha \
--i-table table.qza \
--p-metric chao1 \
--o-alpha-diversity core_metrics_results/chao1_vector.qza
qiime tools export --input-path core_metrics_results/shannon_vector.qza --output-path exported_alpha/shannon
qiime tools export --input-path core_metrics_results/faith_pd_vector.qza --output-path exported_alpha/faith_pd
qiime tools export --input-path core_metrics_results/observed_features_vector.qza --output-path exported_alpha/observed_features
qiime tools export --input-path core_metrics_results/chao1_vector.qza --output-path exported_alpha/chao1
qiime tools export \
--input-path core_metrics_results/unweighted_unifrac_distance_matrix.qza \
--output-path exported_unweighted_unifrac
qiime tools export \
--input-path core_metrics_results/weighted_unifrac_distance_matrix.qza \
--output-path exported_weighted_unifrac
qiime diversity beta-group-significance \
--i-distance-matrix core_metrics_results/weighted_unifrac_distance_matrix.qza \
--m-metadata-file qiime2_metadata.tsv \
--m-metadata-column Group \
--p-pairwise \
--p-method permanova \
--o-visualization beta_group_significance.qzv
qiime tools export \
--input-path beta_group_significance.qzv \
--output-path exported_beta_group
#✅ Group 1 / Group 2 — The pairwise comparisons.
#✅ Sample size — Number of samples used in the test.
#✅ Permutations — Number of permutations in the PERMANOVA test.
#✅ pseudo-F — The test statistic from PERMANOVA.
#✅ p-value — The unadjusted p-value.
#✅ q-value — The adjusted p-value (Bonferroni in QIIME2).
#The q-value column (also sometimes called p-adj) is the multiple-testing corrected p-value.
#👉 q < 0.05 means the difference is statistically significant between those two groups, even after correction.
#“There is a significant difference in community composition between Group 1 and Group 2 (p=0.002).”
#The --p-sampling-depth 6389 parameter is directly equivalent to QIIME 1’s -e 6389!
#The QIIME 2 command will compute:
#✅ Alpha diversity metrics (Observed OTUs, Shannon, Faith PD, Evenness)
#✅ Beta diversity distance matrices (UniFrac, Bray-Curtis)
#✅ PCoA plots
#✅ Excludes samples with fewer than 6,389 reads.
#📦 Output Folders and Files
#Output Description
#table.qzv Visual of feature table (sample counts)
#rep-seqs.qzv Sequences per ASV
#denoising-stats.qzv DADA2 read tracking
#taxonomy.qza/.qzv Taxonomic classification of ASVs
#taxa-bar-plots.qzv Interactive bar plots
#core_metrics_results/ Alpha/beta diversity metrics + PCoA plots
Prepare three files feeding to Phyloseq.Rmd: table.qza (see above with ), rooted-tree.qza (see above with ), qiime2_metadata_for_qza_to_phyloseq.tsv edited from qiime2_metadata.tsv.
# Rarefying can be performed here, or in Phyloseq.Rmd (default), therefore, we don't need this step any more.
qiime feature-table summarize \
--i-table core_metrics_results/rarefied_table.qza \
--o-visualization rarefied_table.qzv \
--m-sample-metadata-file qiime2_metadata.tsv
#Table summary
#Metric Sample
#Number of samples 136
#Number of features 2,781
#Total frequency 868,904
# In QIIME2, we need table.qza, not biom-file, therefore, we don't need this step any more.
qiime tools export \
--input-path core_metrics_results/rarefied_table.qza \
--output-path exported_rarefied_table
#-->
exported_rarefied_table/feature-table.biom
biom convert \
-i exported_rarefied_table/feature-table.biom \
-o exported_rarefied_table/feature-table.tsv \
--to-tsv
#✅ Old QIIME 1 table with GenBank IDs (like EF603722.1.1487) as feature labels.
#✅ QIIME 2 table where feature IDs are hashes (like 0b438323a296b5f2ce2c8bbe3949ee8d).
# Visulaize the taxonomy.qza
qiime tools export \
--input-path taxonomy.qza \
--output-path exported-taxonomy
#Feature ID Taxon Confidence
#0b4383... k__Bacteria; p__Proteobacteria... 0.98
#dfa833... k__Bacteria; p__Firmicutes... 0.87
#...
# ---- I used the following to generate two file for feeding in the Phyloseq.Rmd ----
#-1- exported_table/feature-table.biom corresesponds to table_even6389.biom in QIIME1, but in QIIME2, we don't need biom-file, instead of table.qza.
qiime tools export \
--input-path dada2_tests2/test_7_f240_r240/table.qza \
--output-path exported_table
#--> exported_table/feature-table.biom
#-2- exported-tree/tree.nwk corresesponds to rep_set.tre in QIIME1
qiime tools export \
--input-path rooted-tree.qza \
--output-path exported-tree
#--> exported-tree/tree.nwk
# ---- The code in Phyloseq.Rmd ----
#install.packages("remotes")
#remotes::install_github("jbisanz/qiime2R")
#"core_metrics_results/rarefied_table.qza", rarefying performed in the code, therefore import the raw table.
library(qiime2R)
ps.ng.tax <- qza_to_phyloseq(
features = "dada2_tests2/test_7_f240_r240/table.qza",
tree = "rooted-tree.qza",
metadata = "qiime2_metadata_for_qza_to_phyloseq.tsv"
)
# or
#biom convert \
# -i ./exported_table/feature-table.biom \
# -o ./exported_table/feature-table-v1.biom \
# --to-json
#ps.ng.tax <- import_biom("./exported_table/feature-table-v1.biom", treefilename="./exported-tree/tree.nwk")
#Note that the alpha- and beta-diversity-files needed in Phyloseq.Rmd has been prepared in the step 9.
Figures generated by Phyloseq.Rmd and MicrobiotaProcess_*.R
The following files can be found under server.
./Phyloseq.Rmd (Result Phyloseq.html)
./MicrobiotaProcess_cluster1_Group9-11_vs_cluster2_Group12-14_orig.R
./MicrobiotaProcess_Group1_vs_Group2.R
./MicrobiotaProcess_Group3_vs_Group4.R
./MicrobiotaProcess_PCA_Group1-4.R
./MicrobiotaProcess_PCA_Group9-14.R
点赞本文的读者
还没有人对此文章表态
没有评论
Variant calling for Data_Pietschmann_229ECoronavirus_Mutations_2025 (via docker own_viral_ngs)
How to debug and construct the docker docker own_viral_ngs?
Visualization and Export of miRNA Expression Profiles Using Manhattan Plots in R
Analysis of the RNA binding protein (RBP) motifs for RNA-Seq and miRNAs (v3)
© 2023 XGenes.com Impressum