gene_x 0 like s 474 view s
Tags: processing, NGS, pipeline
Raw_Data
#Here are some more information on the two sample collections for epidome sequencing: 9+10+33=52
#Samples 7N-15N are nose swabs of 9 individual patients, and 16-20F/N are feet (F) and nose (N) swabs of 5 more patients.
#All of these patients were hospitalized for endoprosthesis surgery.
#nose swaps of 9 individual patients
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1436/7N_S1_R1_001.fastq.gz P7-Nose_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1436/7N_S1_R2_001.fastq.gz P7-Nose_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1437/8N_S2_R1_001.fastq.gz P8-Nose_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1437/8N_S2_R2_001.fastq.gz P8-Nose_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1438/9N_S3_R1_001.fastq.gz P9-Nose_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1438/9N_S3_R2_001.fastq.gz P9-Nose_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1439/10N_S4_R1_001.fastq.gz P10-Nose_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1439/10N_S4_R2_001.fastq.gz P10-Nose_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1440/11N_S5_R1_001.fastq.gz P11-Nose_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1440/11N_S5_R2_001.fastq.gz P11-Nose_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1441/12N_S6_R1_001.fastq.gz P12-Nose_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1441/12N_S6_R2_001.fastq.gz P12-Nose_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1442/13N_S7_R1_001.fastq.gz P13-Nose_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1442/13N_S7_R2_001.fastq.gz P13-Nose_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1443/14N_S8_R1_001.fastq.gz P14-Nose_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1443/14N_S8_R2_001.fastq.gz P14-Nose_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1444/15N_S9_R1_001.fastq.gz P15-Nose_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1444/15N_S9_R2_001.fastq.gz P15-Nose_R2.fastq.gz
#16-20F/N are feet (F) and nose (N) swabs of 5 more patients 10
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1445/16F_S10_R1_001.fastq.gz P16-Foot_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1445/16F_S10_R2_001.fastq.gz P16-Foot_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1446/16N_S11_R1_001.fastq.gz P16-Nose_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1446/16N_S11_R2_001.fastq.gz P16-Nose_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1447/17F_S12_R1_001.fastq.gz P17-Foot_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1447/17F_S12_R2_001.fastq.gz P17-Foot_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1448/17N_S13_R1_001.fastq.gz P17-Nose_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1448/17N_S13_R2_001.fastq.gz P17-Nose_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1449/18F_S14_R1_001.fastq.gz P18-Foot_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1449/18F_S14_R2_001.fastq.gz P18-Foot_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1450/18N_S15_R1_001.fastq.gz P18-Nose_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1450/18N_S15_R2_001.fastq.gz P18-Nose_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1451/19F_S16_R1_001.fastq.gz P19-Foot_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1451/19F_S16_R2_001.fastq.gz P19-Foot_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1452/19N_S17_R1_001.fastq.gz P19-Nose_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1452/19N_S17_R2_001.fastq.gz P19-Nose_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1453/20F_S18_R1_001.fastq.gz P20-Foot_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1453/20F_S18_R2_001.fastq.gz P20-Foot_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1454/20N_S19_R1_001.fastq.gz P20-Nose_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1454/20N_S19_R2_001.fastq.gz P20-Nose_R2.fastq.gz
#Samples 1-108 are swabs of noses, lesioned skin (LH) and non-lesioned skin (NLH) of 11 patients suffering from atopic dermatitis. 33
#There are more details on these samples in the attached excel file.
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1455/1_S20_R1_001.fastq.gz RP-Nose_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1455/1_S20_R2_001.fastq.gz RP-Nose_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1456/2_S21_R1_001.fastq.gz RP-LH_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1456/2_S21_R2_001.fastq.gz RP-LH_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1457/3_S22_R1_001.fastq.gz RP-NLH_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1457/3_S22_R2_001.fastq.gz RP-NLH_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1458/4_S23_R1_001.fastq.gz AL-Nose_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1458/4_S23_R2_001.fastq.gz AL-Nose_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1459/5_S24_R1_001.fastq.gz AL-LH_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1459/5_S24_R2_001.fastq.gz AL-LH_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1460/6_S25_R1_001.fastq.gz AL-NLH_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1460/6_S25_R2_001.fastq.gz AL-NLH_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1461/22_S26_R1_001.fastq.gz MC-Nose_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1461/22_S26_R2_001.fastq.gz MC-Nose_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1462/23_S27_R1_001.fastq.gz MC-LH_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1462/23_S27_R2_001.fastq.gz MC-LH_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1463/24_S28_R1_001.fastq.gz MC-NLH_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1463/24_S28_R2_001.fastq.gz MC-NLH_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1464/25_S29_R1_001.fastq.gz SA-Nose_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1464/25_S29_R2_001.fastq.gz SA-Nose_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1465/26_S30_R1_001.fastq.gz SA-LH_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1465/26_S30_R2_001.fastq.gz SA-LH_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1466/27_S31_R1_001.fastq.gz SA-NLH_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1466/27_S31_R2_001.fastq.gz SA-NLH_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1467/28_S32_R1_001.fastq.gz HR-Nose_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1467/28_S32_R2_001.fastq.gz HR-Nose_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1468/29_S33_R1_001.fastq.gz HR-LH_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1468/29_S33_R2_001.fastq.gz HR-LH_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1469/30_S34_R1_001.fastq.gz HR-NLH_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1469/30_S34_R2_001.fastq.gz HR-NLH_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1470/34_S35_R1_001.fastq.gz XN-Nose_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1470/34_S35_R2_001.fastq.gz XN-Nose_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1471/35_S36_R1_001.fastq.gz XN-LH_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1471/35_S36_R2_001.fastq.gz XN-LH_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1472/36_S37_R1_001.fastq.gz XN-NLH_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1472/36_S37_R2_001.fastq.gz XN-NLH_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1487/AY_S52_R1_001.fastq.gz MR-Nose_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1487/AY_S52_R2_001.fastq.gz MR-Nose_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1473/50_S38_R1_001.fastq.gz MR-LH_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1473/50_S38_R2_001.fastq.gz MR-LH_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1474/51_S39_R1_001.fastq.gz MR-NLH_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1474/51_S39_R2_001.fastq.gz MR-NLH_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1475/58_S40_R1_001.fastq.gz CB-Nose_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1475/58_S40_R2_001.fastq.gz CB-Nose_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1476/59_S41_R1_001.fastq.gz CB-LH_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1476/59_S41_R2_001.fastq.gz CB-LH_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1477/60_S42_R1_001.fastq.gz CB-NLH_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1477/60_S42_R2_001.fastq.gz CB-NLH_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1478/94_S43_R1_001.fastq.gz KK-Nose_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1478/94_S43_R2_001.fastq.gz KK-Nose_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1479/95_S44_R1_001.fastq.gz KK-LH_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1479/95_S44_R2_001.fastq.gz KK-LH_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1480/96_S45_R1_001.fastq.gz KK-NLH_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1480/96_S45_R2_001.fastq.gz KK-NLH_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1481/103_S46_R1_001.fastq.gz AH-Nose_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1481/103_S46_R2_001.fastq.gz AH-Nose_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1482/104_S47_R1_001.fastq.gz AH-LH_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1482/104_S47_R2_001.fastq.gz AH-LH_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1483/105_S48_R1_001.fastq.gz AH-NLH_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1483/105_S48_R2_001.fastq.gz AH-NLH_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1484/106_S49_R1_001.fastq.gz PT2-Nose_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1484/106_S49_R2_001.fastq.gz PT2-Nose_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1485/107_S50_R1_001.fastq.gz PT2-LH_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1485/107_S50_R2_001.fastq.gz PT2-LH_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1486/108_S51_R1_001.fastq.gz PT2-NLH_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1486/108_S51_R2_001.fastq.gz PT2-NLH_R2.fastq.gz
Processing for Epidome yycH and g216
#-->92-301
#yycH: 476 + 44 -->minLen=200 (76)
#g216: 448 + 44 -->minLen=185 (78)
#16S: 410-427 + 44 -->minLen=170 (70)
#360/2=180 *
#200 and 200 *
#>seq1;
#TGGGTATGGCAATCACTTTACA
#AGAATTCTATATTAAAGATGTTCTAATTGTGGAAAAGGGATCCATCGGTCATTCATTTAAACATTGGCCTCTATCAACAAAGACCATCACACCATCATTTACAACTAATGGTTTTGGCATGCCAGATATGAATGCAATAGCTAAAGATACATCACCTGCCTTCACTTTCAATGAAGAACATTTGTCTGGAAATAATTACGCTCAATACATTTCATTAGTAGCTGAGCATTACAATCTAAATGTCAAAACAAATACCAATGTTTCACGTGTAACATACATAGATGGTATATATCATGTATCAACGGACTATGGTGTTTATACCGCAGATTATATATTTATAGCAACTGGAGACTATTCATTCCCATATCATCCTTTTTCATATGGACGTCATTACAGTGAGATTCGAGCGTTCACTCAATTAAACGGTGACGCCTTTACAATTATTGGA GGTAATGAGAGTGCTTTTGATGC
#>M03701:292:000000000-K9M88:1:1101:10277:1358:AAGAGGCA+TATCCTCT
#TGGGTATGRCAATCACTTTACA
#AGAATTCAATATTAAAGATGTTCTAATTGTTGAAAAGGGAACCATCGGTCATTCATTTAAACATTGGCCTCTATCAACAAAGACCATCACACCATCATTTACAACTAATGGTTTTGGCATGCCAGATATGAATGCAATAGCTAAAGATACATCACCTGCCTTCACTTTCAATGAAGAACATTTATCTGGAAAACGTTATGCTGAATACCTCTCACTAGTAGCTACGCATTACAATCTAAATGGCAAAACAAACACCAATGTTTCACGTGTAACATACATAGATGGTGTATATCATGTATCAACGGACTATGGTGTTTATACCGCAGATTATATATTTATAGCAACTGGAGACTATTCATTCCCATATCATCCTTTATCATATGGACGTCATTACAGTGAAATTCAAACATTCACTCAATTAAAAGGTGATGCTTTTACAATCATTGGT GGTAATGAGAGTGCTTTTGATGC
#DIR: ~/DATA/Data_Holger_Epidome/testrun2
#Input: epidome->/home/jhuang/Tools/epidome and rawdata
#Read in 37158 paired-sequences, output 31225 (84%) filtered paired-sequences.
#Read in 82145 paired-sequences, output 78594 (95.7%) filtered paired-sequences.
#-->
#Overwriting file:/home/jhuang/DATA/Data_Holger_Epidome_myData2/cutadapted_yycH/filtered_R1/A10-1_R1.fastq.gz
#Overwriting file:/home/jhuang/DATA/Data_Holger_Epidome_myData2/cutadapted_yycH/filtered_R2/A10-1_R2.fastq.gz
#Read in 37158 paired-sequences, output 35498 (95.5%) filtered paired-sequences.
#Overwriting file:/home/jhuang/DATA/Data_Holger_Epidome_myData2/cutadapted_yycH/filtered_R1/A10-2_R1.fastq.gz
#Overwriting file:/home/jhuang/DATA/Data_Holger_Epidome_myData2/cutadapted_yycH/filtered_R2/A10-2_R2.fastq.gz
#Read in 82145 paired-sequences, output 80918 (98.5%) filtered paired-sequences.
#
#Read in 46149 paired-sequences, output 22206 (48.1%) filtered paired-sequences.
#Read in 197875 paired-sequences, output 168942 (85.4%) filtered paired-sequences.
#Read in 230646 paired-sequences, output 201376 (87.3%) filtered paired-sequences.
#Read in 175759 paired-sequences, output 149823 (85.2%) filtered paired-sequences.
#Read in 147546 paired-sequences, output 128864 (87.3%) filtered paired-sequences.
2.1. quality controls (optional)
#under testrun2 should have
#BiocManager::install("dada2")
library(dada2); packageVersion("dada2")
path <- "/home/jhuang/DATA/Data_Luise_Epidome_batch3/raw_data" # CHANGE ME to the directory containing the fastq files after unzipping.
list.files(path)
# Forward and reverse fastq filenames have format: SAMPLENAME_R1_001.fastq and SAMPLENAME_R2_001.fastq
fnFs <- sort(list.files(path, pattern="_R1.fastq.gz", full.names = TRUE))
fnRs <- sort(list.files(path, pattern="_R2.fastq.gz", full.names = TRUE))
# Extract sample names, assuming filenames have format: SAMPLENAME_XXX.fastq
sample.names <- sapply(strsplit(basename(fnFs), "_"), `[`, 1)
png("quality_fnFs.png", width=800, height=800)
plotQualityProfile(fnFs[1:2])
dev.off()
png("quality_fnRs.png", width=800, height=800)
plotQualityProfile(fnRs[1:2])
dev.off()
2.2. cutadapt instead of Trimmomatic (namely demultiplexing, see epidome/scripts/EPIDOME_yycH_cutadapt_loop.sh)
#Output: cutadapted_yycH cutadapted_g216 cutadapted_16S
#Script: epidome/scripts/EPIDOME_yycH_cutadapt_loop.sh
#5′-CGATGCKAAAGTGCCGAATA-3′/5′-CTTCATTTAAGAAGCCACCWTGACT-3′ for yycH
#5′-TGGGTATGRCAATCACTTTACA-3′/5′-GCATCAAAAGCACTCTCATTACC-3′ for g216
#-p CCTACGGGNGGCWGCAG -q GACTACHVGGGTATCTAATCC -l 300 for 16S
mkdir cutadapted_yycH cutadapted_g216 cutadapted_16S
cd raw_data
#The default is --action=trim. With --action=retain, the read is trimmed, but the adapter sequence itself is not removed.
for file in *_R1.fastq.gz; do
cutadapt -e 0.06 -g CGATGCKAAAGTGCCGAATA -G CTTCATTTAAGAAGCCACCWTGACT --pair-filter=any -o ../cutadapted_yycH/${file} --paired-output ../cutadapted_yycH/${file/R1.fastq.gz/R2.fastq.gz} --discard-untrimmed $file ${file/R1.fastq.gz/R2.fastq.gz};
done
for file in *_R1.fastq.gz; do
cutadapt -e 0.06 -g TGGGTATGRCAATCACTTTACA -G GCATCAAAAGCACTCTCATTACC --pair-filter=any -o ../cutadapted_g216/${file} --paired-output ../cutadapted_g216/${file/R1.fastq.gz/R2.fastq.gz} --discard-untrimmed $file ${file/R1.fastq.gz/R2.fastq.gz};
done
for file in *_R1.fastq.gz; do
cutadapt -e 0.06 -g CCTACGGGNGGCWGCAG -G GACTACHVGGGTATCTAATCC --pair-filter=any -o ../cutadapted_16S/${file} --paired-output ../cutadapted_16S/${file/R1.fastq.gz/R2.fastq.gz} --discard-untrimmed $file ${file/R1.fastq.gz/R2.fastq.gz};
done
2.3. (IGNORED) regenerate filtered_R1 and filtered_R2 (under conda env qiime1 using pear) --> IGNORED, since we should use filterAndTrim from data2 in the next step!)
#DEPRECATED: mkdir pandaseq_16S pandaseq_yycH pandaseq_g216
#DEPRECATED: -p CCTACGGGNGGCWGCAG -q GACTACHVGGGTATCTAATCC
#DEPRECATED: for file in cutadapted_16S/*_R1.fastq.gz; do pandaseq -f ${file} -r ${file/_R1.fastq.gz/_R2.fastq.gz} -l 300 -w pandaseq_16S/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_merged.fasta >> LOG_pandaseq_16S; done
#https://learnmetabarcoding.github.io/LearnMetabarcoding/processing/pair_merging.html#
#conda install -c conda-forge -c bioconda -c defaults seqkit
mkdir pear_yycH pear_g216 pear_16S
for file in cutadapted_yycH/*_R1.fastq.gz; do pear -f ${file} -r ${file/_R1.fastq.gz/_R2.fastq.gz} -j 4 -q 26 -v 10 -o pear_yycH/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1) >> LOG_pear_yycH; done
for file in cutadapted_g216/*_R1.fastq.gz; do pear -f ${file} -r ${file/_R1.fastq.gz/_R2.fastq.gz} -j 2 -q 26 -v 10 -o pear_g216/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1) >> LOG_pear_g216; done
for file in cutadapted_16S/*_R1.fastq.gz; do pear -f ${file} -r ${file/_R1.fastq.gz/_R2.fastq.gz} -j 2 -q 26 -v 10 -o pear_16S/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1) >> LOG_pear_16S; done
for file in cutadapted_yycH/*_R1.fastq.gz; do
grep "@M0370" pear_yycH/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1).assembled.fastq > cutadapted_yycH/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_IDs.txt
sed -i -e 's/@//g' cutadapted_yycH/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_IDs.txt
cut -d' ' -f1 cutadapted_yycH/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_IDs.txt > cutadapted_yycH/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_IDs_.txt
seqkit grep -f cutadapted_yycH/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_IDs_.txt cutadapted_yycH/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_R1.fastq.gz -o cutadapted_yycH/filtered_R1/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_R1.fastq.gz
seqkit grep -f cutadapted_yycH/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_IDs_.txt cutadapted_yycH/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_R2.fastq.gz -o cutadapted_yycH/filtered_R2/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_R2.fastq.gz
done
#>>LOG_pear_yycH
for file in cutadapted_g216/*_R1.fastq.gz; do
grep "@M0370" pear_g216/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1).assembled.fastq > cutadapted_g216/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_IDs.txt
sed -i -e 's/@//g' cutadapted_g216/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_IDs.txt
cut -d' ' -f1 cutadapted_g216/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_IDs.txt > cutadapted_g216/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_IDs_.txt
seqkit grep -f cutadapted_g216/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_IDs_.txt cutadapted_g216/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_R1.fastq.gz -o cutadapted_g216/filtered_R1/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_R1.fastq.gz
seqkit grep -f cutadapted_g216/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_IDs_.txt cutadapted_g216/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_R2.fastq.gz -o cutadapted_g216/filtered_R2/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_R2.fastq.gz
done
for file in cutadapted_16S/*_R1.fastq.gz; do
grep "@M0370" pear_16S/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1).assembled.fastq > cutadapted_16S/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_IDs.txt
sed -i -e 's/@//g' cutadapted_16S/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_IDs.txt
cut -d' ' -f1 cutadapted_16S/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_IDs.txt > cutadapted_16S/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_IDs_.txt
seqkit grep -f cutadapted_16S/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_IDs_.txt cutadapted_16S/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_R1.fastq.gz -o cutadapted_16S/filtered_R1/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_R1.fastq.gz
seqkit grep -f cutadapted_16S/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_IDs_.txt cutadapted_16S/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_R2.fastq.gz -o cutadapted_16S/filtered_R2/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_R2.fastq.gz
done
2.4. filtering+trimming+merging+chimera-removing (VERY_IMPORTANT: under conda env r4-base)
#Input: cutadapted_yycH, cutadapted_g216
#Outputs: yycH[g216|16S]_seqtab_from_dada2.rds
# yycH[g216|16S]_seqtab_from_dada2.csv
# yycH[g216|16S]_seqtab_nochim.rds
# yycH[g216|16S]_seqtab_nochim.csv
# yycH[g216|16S]_seqtab_image.RData
# track_yycH[g216|16S].csv -->
#The following scripts are modified from epidome/scripts/dada2_for_EPIDOME_yycH_runwise_pipeline.R
#./my_EPIDOME_g216_runwise_pipeline.R #minLen=185, with FILTERING, it does not work!
#./my_EPIDOME_yycH_runwise_pipeline.R #minLen=200, with FILTERING, it does not work!
./my_EPIDOME_g216_runwise_pipeline_.R > g216_runwise_pipeline_.LOG #minLen=185. NO FILTERING ANY MORE, since the input are filtered_R1 and filtered_R2.
./my_EPIDOME_yycH_runwise_pipeline_.R > yycH_runwise_pipeline_.LOG #minLen=200. NO FILTERING ANY MORE, since the input are filtered_R1 and filtered_R2.
./my_EPIDOME_16S_runwise_pipeline.R > 16S_runwise_pipeline.LOG #minLen=170. IGNORED since the 16S reads will be processed separately as below.
#END
#Overwriting file:/media/jhuang/Elements/Data_Luise_Epidome_batch3/cutadapted_yycH/filtered_R1/AH-LH_R1.fastq.gz
#Overwriting file:/media/jhuang/Elements/Data_Luise_Epidome_batch3/cutadapted_yycH/filtered_R2/AH-LH_R2.fastq.gz
#Read in 90261 paired-sequences, output 89026 (98.6%) filtered paired-sequences.
#Overwriting file:/media/jhuang/Elements/Data_Luise_Epidome_batch3/cutadapted_yycH/filtered_R1/AH-NLH_R1.fastq.gz
#Overwriting file:/media/jhuang/Elements/Data_Luise_Epidome_batch3/cutadapted_yycH/filtered_R2/AH-NLH_R2.fastq.gz
#Read in 74638 paired-sequences, output 73633 (98.7%) filtered paired-sequences.
#Overwriting file:/media/jhuang/Elements/Data_Luise_Epidome_batch3/cutadapted_yycH/filtered_R1/AH-Nose_R1.fastq.gz
#Overwriting file:/media/jhuang/Elements/Data_Luise_Epidome_batch3/cutadapted_yycH/filtered_R2/AH-Nose_R2.fastq.gz
#Read in 71311 paired-sequences, output 70542 (98.9%) filtered paired-sequences.
#Overwriting file:/media/jhuang/Elements/Data_Luise_Epidome_batch3/cutadapted_yycH/filtered_R1/AL-LH_R1.fastq.gz
#Overwriting file:/media/jhuang/Elements/Data_Luise_Epidome_batch3/cutadapted_yycH/filtered_R2/AL-LH_R2.fastq.gz
#Read in 94807 paired-sequences, output 93082 (98.2%) filtered paired-sequences.
~/Tools/csv2xls-0.4/csv_to_xls.py g216_track.csv yycH_track.csv 16S_track.csv -d$';' -o overview.xls;
# Read the CSV file into a DataFrame
df <- read.csv("g216_seqtab_from_dada2_nohead.csv", sep=";", row.name=1, header=FALSE)
#df <- read.csv("g216_seqtab_nochim.csv", sep=";", row.name=1)
# Calculate the sum for each row
row_sums <- rowSums(df)
# Print the row sums
print(row_sums)
> print(row_sums)
AH-LH AH-NLH AH-Nose AL-LH AL-NLH AL-Nose CB-LH CB-NLH
87232 69741 89689 76660 94636 108810 73814 56312
CB-Nose HR-LH HR-NLH HR-Nose KK-LH KK-NLH KK-Nose MC-LH
61740 76216 63165 55479 78550 87579 83826 73738
MC-NLH MC-Nose MR-LH MR-NLH MR-Nose P10-Nose P11-Nose P12-Nose
100338 94956 88158 63054 82361 103782 108533 90398
P13-Nose P14-Nose P15-Nose P16-Foot P16-Nose P17-Foot P17-Nose P18-Foot
87059 95656 110207 67058 77606 58339 95407 87775
P18-Nose P19-Foot P19-Nose P20-Foot P20-Nose P7-Nose P8-Nose P9-Nose
107560 79373 99571 104667 109457 101528 99565 147485
PT2-LH PT2-NLH PT2-Nose RP-LH RP-NLH RP-Nose SA-LH SA-NLH
81074 89345 77121 68946 71414 113722 53465 40966
SA-Nose XN-LH XN-NLH XN-Nose
33381 73842 76028 80630
AH-LH AH-NLH AH-Nose AL-LH AL-NLH AL-Nose CB-LH CB-NLH
79615 52143 57780 70801 89636 99428 52185 54848
CB-Nose HR-LH HR-NLH HR-Nose KK-LH KK-NLH KK-Nose MC-LH
50249 58365 45201 38747 56027 65755 53110 72355
MC-NLH MC-Nose MR-LH MR-NLH MR-Nose P10-Nose P11-Nose P12-Nose
97363 69881 68599 52386 48364 84634 78491 71877
P13-Nose P14-Nose P15-Nose P16-Foot P16-Nose P17-Foot P17-Nose P18-Foot
81694 84290 100606 62621 66484 50015 93498 73730
P18-Nose P19-Foot P19-Nose P20-Foot P20-Nose P7-Nose P8-Nose P9-Nose
93713 63755 70420 81030 105601 94352 70765 124449
PT2-LH PT2-NLH PT2-Nose RP-LH RP-NLH RP-Nose SA-LH SA-NLH
56869 74951 62339 68726 71145 111279 47754 36180
SA-Nose XN-LH XN-NLH XN-Nose
26130 55647 69993 50727
#wc -l cutadapted_yycH/filtered_R1$ vim Extraction-control-2_R1.fastq.gz #-->2696
#Read in 1138 paired-sequences, output 674 (59.2%) filtered paired-sequences.
#"Extraction-control-2";0;61;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;591;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 -->600 sequences
#"Extraction-control-2";0;61;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;591;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 #-->after chimera-removing, only 107 sequences
#Processing: Extraction-control-2
#Sample 1 - 674 reads in 209 unique sequences (What does the unique sequences mean???). #merged sequences are 209
#Sample 1 - 674 reads in 280 unique sequences.
#"";"input_read_count"*; "filtered_and_trimmed_read_count";"merged_after_dada2_read_count"; "non-chimeric_read_count"*
#"Extraction-control-2"; 1138*; 674;652; 652*
#biom convert -i table.txt -o table.from_txt_json.biom --table-type="OTU table" --to-json
#summarize_taxa_through_plots.py -i clustering/otu_table_mc2_w_tax_no_pynast_failures.biom -o plots/taxa_summary -s
#summarize_taxa.py -i otu_table.biom -o ./tax
#g216_track.csv
"A10-1";36009;493;367;367
"A10-2";175221;82727;30264;27890
"A10-3";110170;36812;13715;13065
"A10-4";142323;64398;24306;21628
#yycH_track.csv
"A10-1";37158;549;0;0 pandaseq:36791
"A10-2";82145;23953;6180;5956
"A10-3";53438;12480;2944;2944
"A10-4";64516;18361;12350;11797
#16S_track.csv
"A10-1";46149(in cutadapted_16S);13(in filtered_R1);8;8
"A10-2";197875;2218;1540;1391
"A10-3";230646;2429;1819;1752
"A10-4";175759;2001;1439;1366
#zcat A10-1_R1.fastq.gz | echo $((`wc -l`/4))
#121007 > 37158 + 36009 + 46149 = 119316
#458392 > 82145 + 175221 + 197875 = 455241
2.5. (IGNORED) stitching and removing chimeras (see ~/DATA/Data_Holger_Epidome/epidome/scripts/Combine_and_Remove_Chimeras_yycH.R)
#my_Combine_and_Remove_Chimeras_g216.R is a part of my_EPIDOME_yycH_runwise_pipeline.R (see lines 53-55) --> IGNORED!
2.6. Classification: epidome/scripts/ASV_blast_classification.py
#Input: g216_seqtab_nochim.csv using DATABASE epidome/DB/g216_ref_aln.fasta
#Output: g216_seqtab_ASV_seqs.fasta, g216_seqtab_ASV_blast.txt and g216_seqtab.csv.classified.csv
python3 epidome/scripts/ASV_blast_classification.py yycH_seqtab_nochim.csv yycH_seqtab_ASV_seqs.fasta epidome/DB/yycH_ref_aln.fasta yycH_seqtab_ASV_blast.txt yycH_seqtab.csv.classified.csv 99.5
python3 epidome/scripts/ASV_blast_classification.py g216_seqtab_nochim.csv g216_seqtab_ASV_seqs.fasta epidome/DB/g216_ref_aln.fasta g216_seqtab_ASV_blast.txt g216_seqtab.csv.classified.csv 99.5
#old: python3 epidome/scripts/ASV_blast_classification.py yycH_seqtab.csv yycH_seqtab.csv.ASV_seqs.fasta epidome/DB/yycH_ref_aln.fasta yycH_seqtab.csv.ASV_blast.txt yycH_seqtab.csv.classified.csv 99.5
#old: python3 epidome/scripts/ASV_blast_classification_combined.py -p1 190920_run1_yycH_seqtab_from_dada2.csv -p2 190920_run1_G216_seqtab_from_dada2.csv -p1_ref epidome/DB/yycH_ref_aln.fasta -p2_ref epidome/DB/g216_ref_aln.fasta
##rename "seqseq2" --> seq2
#sed -i -e s/seq//g 190920_run1_yycH_seqtab_from_dada2.csv.ASV_blast.txt
#sed -i -e s/seqseq/seq/g 190920_run1_yycH_seqtab_from_dada2.csv.classified.csv
#diff 190920_run1_yycH_seqtab_from_dada2.csv.ASV_seqs.fasta epidome/example_data/190920_run1_yycH_seqtab_from_dada2.csv.ASV_seqs.fasta
#diff 190920_run1_yycH_seqtab_from_dada2.csv.ASV_blast.txt epidome/example_data/190920_run1_yycH_seqtab_from_dada2.csv.ASV_blast.txt
#diff 190920_run1_yycH_seqtab_from_dada2.csv.classified.csv epidome/example_data/190920_run1_yycH_seqtab_from_dada2.csv.classified.csv
## WHY: 667 seqs in old calculation, but in our calculation only 108 seqs
## They took *_seqtab_from_dada2.csv, but we took *_seqtab_nochim.csv. (653 vs 108 records!)
##AAAT";"seq37,36";0;
sed -i -e s/seq//g yycH_seqtab_ASV_blast.txt #length=476
sed -i -e s/seq//g g216_seqtab_ASV_blast.txt #length=448
#;-->""
sed -i -e s/';'//g yycH_seqtab_ASV_blast.txt
sed -i -e s/';'//g g216_seqtab_ASV_blast.txt
sed -i -e s/seqseq/seq/g yycH_seqtab.csv.classified.csv
sed -i -e s/seqseq/seq/g g216_seqtab.csv.classified.csv
#;,seq --> ,seq
#;"; --> ";
sed -i -e s/";,seq"/",seq"/g yycH_seqtab.csv.classified.csv
sed -i -e s/";,seq"/",seq"/g g216_seqtab.csv.classified.csv
sed -i -e s/";\";"/"\";"/g yycH_seqtab.csv.classified.csv
sed -i -e s/";\";"/"\";"/g g216_seqtab.csv.classified.csv
#"ASV";"Seq_number";"even-mock3-1_S258_L001";"even-mock3-2_S282_L001";"even-mock3-3_S199_L001";"staggered-mock3-1_S270_L001";"staggered-mock3-2_S211_L001";"staggered-mock3-3_S223_L001"
#"ASV";"Seq_number";"Extraction_control_1";"Extraction_control_2";"P01_nose_1";"P01_nose_2";"P01_skin_1";"P01_skin_2";"P02_nose_1";"P02_nose_2";"P02_skin_1";"P02_skin_2";"P03_nose_1";"P03_nose_2";"P03_skin_1";"P03_skin_2";"P04_nose_1";"P04_nose_2";"P04_skin_1";"P04_skin_2";"P05_nose_1";"P05_nose_2";"P05_skin_1";"P05_skin_2";"P06_nose_1";"P06_nose_2";"P06_skin_1";"P06_skin_2";"P07_nose_1";"P07_nose_2";"P07_skin_1";"P07_skin_2";"P08_nose_1";"P08_nose_2";"P08_skin_1";"P08_skin_2";"P09_nose_1";"P09_nose_2";"P09_skin_1";"P09_skin_2";"P10_nose_1";"P10_nose_2";"P10_skin_1";"P10_skin_2";"P11_nose_1";"P11_nose_2";"P11_skin_1";"P11_skin_2";"even-mock3-1_S258_L001";"even-mock3-2_S282_L001";"even-mock3-3_S199_L001";"staggered-mock3-1_S270_L001";"staggered-mock3-2_S211_L001";"staggered-mock3-3_S223_L001"
grep -v ";NA;" g216_seqtab.csv.classified.csv > g216_seqtab.csv.classified_noNA.csv
grep -v ";NA;" yycH_seqtab.csv.classified.csv > yycH_seqtab.csv.classified_noNA.csv
https://github.com/ssi-dk/epidome/blob/master/example_data/190920_run1_G216_seqtab_from_dada2.csv.classified.csv
#DEBUG using LibreOffice, e.g. libreoffice --calc yycH_seqtab.csv.classified_noNA.csv after adding "ID"; at the corner.
seq24,seq21 --> seq24,21
#(OPTIONAL) TO reduce the unclassified, rename seq31,30 --> seq in g216, seq37,36 --> seq in yycH.
2.7. draw plot from three amplicons: cutadapted_g216, cutadapted_yycH, and cutadapted_16S
#Taxonomic database setup and classification
#- Custom databases of all unique g216 and yycH target sequences can be found at https://github.com/ssi-dk/epidome/tree/master/DB.
#- We formatted our g216 and yycH gene databases to be compatible with DADA2’s assign-Taxonomy function and used it to classify the S. epidermidis ASVs with the RDP naive Bayesian classifier method (https://github.com/ssi-dk/epidome/tree/master/scripts).
#- ST classification of samples was performed using the g216 target sequence as the primary identifier.
#- All g216 sequences unique to a single clonal cluster in the database were immediately classified as the matching clone, and in cases were the g216 sequence matched multiple clones, the secondary yycH target sequences were parsed to determine which clone was present. When this classification failed to resolve due to multiple potential combinations of sequences, ASVs were categorized as “Unclassified”. Similarly, g216 sequences not found in the database were labelled as “Novel”.
#~/Tools/csv2xls-0.4/csv_to_xls.py g216_seqtab.csv.classified_noNA.csv yycH_seqtab.csv.classified_noNA.csv -d$'\t' -o counts.xls
#under r4-base
source("epidome/scripts/epidome_functions.R")
ST_amplicon_table = read.table("epidome/DB/epidome_ST_amplicon_frequencies.txt",sep = "\t")
#"ST" "Group" "epi01_ASV" "epi02_ASV" "freq"
#"8888" "324" 113 1 1 2
#"815097" "5" 48 40 2 55
#"846906" "225" 61 3 3 1
#"846960" "-" 62 3 3 1
#"847064" "225" 63 3 3 2
#"847222" "225" 65 3 3 3
#"865555" "278" 37 5 3 4
epi01_table = read.table("g216_seqtab.csv.classified_noNA.csv",sep = "\t",header=TRUE,row.names=1)
epi02_table = read.table("yycH_seqtab.csv.classified_noNA.csv",sep = "\t",header=TRUE,row.names=1)
#> sum(epi01_table$AH.LH)
#[1] 78872
#> sum(epi02_table$AH.LH)
#[1] 86949
#construct metadata.txt as follows.
#"sample.ID" "patient.ID" "sample.site" "sample.type" "patient.sample.site"
#P7.Nose P7 Nose swab P7.Nose.swab
#P8.Nose P8 Nose swab P8.Nose.swab
#P9.Nose P9 Nose swab P9.Nose.swab
#P10.Nose P10 Nose swab P10.Nose.swab
#P11.Nose P11 Nose swab P11.Nose.swab
#P12.Nose P12 Nose swab P12.Nose.swab
#P13.Nose P13 Nose swab P13.Nose.swab
metadata_table = read.table("metadata.txt",header=TRUE,row.names=1)
metadata_table$patient.ID <- factor(metadata_table$patient.ID, levels=c("P7","P8","P9","P10","P11","P12","P13","P14","P15","P16","P17","P18","P19","P20", "AH","AL","CB","HR","KK", "MC","MR","PT2","RP","SA","XN"))
epidome_object = setup_epidome_object(epi01_table,epi02_table,metadata_table = metadata_table)
#Image1
primer_compare = compare_primer_output(epidome_object,color_variable = "sample.type")
png("image1.png")
primer_compare$plot
dev.off()
eo_ASV_combined = combine_ASVs_epidome(epidome_object)
eo_filtered = filter_lowcount_samples_epidome(eo_ASV_combined,500,500)
count_table = classify_epidome(eo_ASV_combined,ST_amplicon_table)
#count_df_ordered = count_table[order(rowSums(count_table),decreasing = T),]
#install.packages("pls")
#library(pls)
#install.packages("reshape")
#library(reshape)
#install.packages("vegan")
library('vegan')
library(scales)
library(RColorBrewer)
#Image2
#TODO: find out what are the combination 21006 in Aachen?
source("epidome/scripts/epidome_functions.R")
#count_table = count_table[-29,]
#row.names(count_table) <- c("-", "ST297", "ST170", "ST73", "ST225", "ST673", "ST215", "ST19", "Unclassified")
#row.names(count_table) <- c("NA", "-", "X297", "X170", "X73", "X225", "X673", "X215", "X19", "Unclassified")
row.names(count_table) <- c("ST215","ST130","ST278","ST200","ST5","ST59","ST83","ST114","ST297","ST384","ST14","ST89","ST210","-","ST328","ST331","ST73","ST2","ST88","ST100","ST10","ST290","ST87","ST23","ST218","ST329","ST19","ST225","ST170","Unclassified")
#colnames(count_table) <- c("A2.1","A2.2","A2.3","A3.1","A3.2","A3.3","A4.1","A4.2","A4.3","A4.4","A5.1","A5.2","A5.3","A5.4","A5.5","A5.6","A5.7","A10.1","A10.2","A10.3","A10.4","A17.1","A17.2","A17.3","A21.1","A21.2","A21.3","A22.1","A22.2","A22.3","A24.1","A24.2","A24.3","A25.1","A25.2","A25.3","A27.1","A27.2","A27.3","A28.1","A28.2","A28.3","LM.Nose","LM.Foot","LZ.Foot","LZ.Nose","NG.Foot","NG.Nose","VK.Foot","VK.Nose","AK.Foot","AK.Nose","MS.Foot","MS.Nose","AH.Nose","AH.Foot","AY_Nose","AY.Foot","JS.Nose","JS.Foot","PC.Nose","PC.Foot","SB.Nose","SB.Foot")
#col_order <- c("A2.1","A2.2","A2.3","A3.1","A3.2","A3.3","A4.1","A4.2","A4.3","A4.4","A5.1","A5.2","A5.3","A5.4","A5.5","A5.6","A5.7","A10.1","A10.2","A10.3","A10.4","A17.1","A17.2","A17.3","A21.1","A21.2","A21.3","A22.1","A22.2","A22.3","A24.1","A24.2","A24.3","A25.1","A25.2","A25.3","A27.1","A27.2","A27.3","A28.1","A28.2","A28.3", "LM.Nose","LM.Foot", "LZ.Nose","LZ.Foot", "NG.Nose","NG.Foot", "VK.Nose","VK.Foot", "AK.Nose","AK.Foot", "MS.Nose","MS.Foot", "AH.Nose","AH.Foot", "AY_Nose","AY.Foot", "JS.Nose","JS.Foot", "PC.Nose","PC.Foot", "SB.Nose","SB.Foot")
#count_table_reordered <- count_table[,col_order]
count_table_reordered <- count_table
write.csv(file="count_table.txt", count_table_reordered)
#NOTE to change rowname from '-' to 'Novel'
p = make_barplot_epidome(count_table_reordered,reorder=FALSE,normalize=TRUE)
#p = make_barplot_epidome(count_table_reordered,reorder=TRUE,normalize=TRUE)
png("Barplot_All.png", width=1600, height=900)
p
dev.off()
#Image3
eo_clinical = prune_by_variable_epidome(epidome_object,"sample.type",c("swab"))
#eo_Aachen = prune_by_variable_epidome(epidome_object,"sample.type",c("Aachen"))
epidome_object_clinical_norm = normalize_epidome_object(eo_clinical) ### Normalize counts to percent
#epidome_object_Aachen_norm = normalize_epidome_object(eo_Aachen)
png("PCA_by_patientID.png", width=1200, height=800)
PCA_patient_colored = plot_PCA_epidome(eo_filtered,color_variable = "patient.ID",colors=c(), plot_ellipse = FALSE)
PCA_patient_colored + ggtitle("PCA plot of all samples colored by patient ID")
dev.off()
png("PCA_Clinical_by_patientID.png", width=1200, height=800)
PCA_patient_colored = plot_PCA_epidome(epidome_object_clinical_norm,color_variable = "patient.ID",colors=c(), plot_ellipse = FALSE)
PCA_patient_colored + ggtitle("PCA plot of clinical samples colored by patient ID")
dev.off()
#png("PCA_Aachen_by_patientID.png", width=1200, height=800)
#PCA_sample_site_colored = plot_PCA_epidome(epidome_object_Aachen_norm,color_variable = "patient.ID",colors=c(), plot_ellipse = FALSE)
#PCA_sample_site_colored + ggtitle("PCA plot of nose and foot samples colored by patient ID")
#dev.off()
#png("PCA_Aachen_by_sampleSite.png", width=1200, height=800)
#PCA_sample_site_colored = plot_PCA_epidome(epidome_object_Aachen_norm,color_variable = "sample.site",colors = c("Red","Blue"),plot_ellipse = FALSE)
#PCA_sample_site_colored + ggtitle("PCA plot of nose and foot samples colored by sampling site")
#dev.off()
#Image4
eo_filter_lowcount = filter_lowcount_samples_epidome(epidome_object,p1_threshold = 500,p2_threshold = 500)
#-->[1] "1 low count samples removed from data: RP.Nose"
eo_filter_ASVs = epidome_filtered_ASVs = filter_lowcount_ASVs_epidome(epidome_object,percent_threshold = 1)
epidome_object_normalized = normalize_epidome_object(epidome_object)
epidome_object_ASV_combined = combine_ASVs_epidome(epidome_object)
epidome_object_clinical = prune_by_variable_epidome(epidome_object,variable_name = "sample.type",variable_values = c("swab"))
#epidome_object_Aachen= prune_by_variable_epidome(epidome_object,variable_name = "sample.type",variable_values = c("Aachen"))
eo_ASV_combined = combine_ASVs_epidome(epidome_object_clinical)
count_table_reordered = classify_epidome(eo_ASV_combined,ST_amplicon_table)
p = make_barplot_epidome(count_table_reordered,reorder=FALSE,normalize=TRUE)
png("Barplot_Clinical.png", width=1200, height=800)
p
dev.off()
#eo_ASV_combined = combine_ASVs_epidome(epidome_object_Aachen)
#count_table_reordered = classify_epidome(eo_ASV_combined,ST_amplicon_table)
#colnames(count_table_reordered) <- c("LM.Nose","LM.Foot","LZ.Nose","LZ.Foot","NG.Nose","NG.Foot","VK.Nose","VK.Foot","AK.Nose","AK.Foot","MS.Nose","MS.Foot","AH.Nose","AH.Foot","AY_Nose","AY.Foot","JS.Nose","JS.Foot","PC.Nose","PC.Foot","SB.Nose","SB.Foot")
#p = make_barplot_epidome(count_table_reordered,reorder=FALSE,normalize=TRUE)
#png("Barplot_Aachen.png", width=1200, height=800)
#p
#dev.off()
3.0. The methods for 16S
3.1. stitch
mkdir pandaseq.out
#-p CCTACGGGNGGCWGCAG -q GACTACHVGGGTATCTAATCC
for file in cutadapted_16S/filtered_R1/*_R1.fastq.gz; do echo "pandaseq -f ${file} -r ${file/_R1/_R2} -l 300 -w pandaseq.out/$(echo $file | cut -d'/' -f3 | cut -d'_' -f1)_merged.fasta >> LOG_pandaseq"; done
pandaseq -f cutadapted_16S/filtered_R1/AH-LH_R1.fastq.gz -r cutadapted_16S/filtered_R2/AH-LH_R2.fastq.gz -l 300 -w pandaseq.out/AH-LH_merged.fasta >> LOG_pandaseq
...
grep ">" AH-LH_merged.fasta | wc -l
...
jhuang@hamburg:~/DATA/Data_Luise_Epidome_batch3/core_diversity_e33778$ grep "AH.LH" biom_table_summary.txt
AH.LH: 75.566,000
...
3.2. create two QIIME mapping files
validate_mapping_file.py -m map2.txt
3.3. combine files into a labeled file
add_qiime_labels.py -i pandaseq.out -m map2_corrected.txt -c FileInput -o combined_fasta
3.4. remove chimeric sequences using usearch
cd combined_fasta
pyfasta split -n 100 combined_seqs.fna
for i in {000..099}; do echo "identify_chimeric_seqs.py -i combined_fasta/combined_seqs.fna.${i} -m usearch61 -o usearch_checked_combined.${i}/ -r ~/REFs/gg_97_otus_4feb2011_fw_rc.fasta --threads=14;" >> uchime_commands.sh; done
mv uchime_commands.sh ..
./uchime_commands.sh
cat usearch_checked_combined.000/chimeras.txt usearch_checked_combined.001/chimeras.txt usearch_checked_combined.002/chimeras.txt usearch_checked_combined.003/chimeras.txt usearch_checked_combined.004/chimeras.txt usearch_checked_combined.005/chimeras.txt usearch_checked_combined.006/chimeras.txt usearch_checked_combined.007/chimeras.txt usearch_checked_combined.008/chimeras.txt usearch_checked_combined.009/chimeras.txt usearch_checked_combined.010/chimeras.txt usearch_checked_combined.011/chimeras.txt usearch_checked_combined.012/chimeras.txt usearch_checked_combined.013/chimeras.txt usearch_checked_combined.014/chimeras.txt usearch_checked_combined.015/chimeras.txt usearch_checked_combined.016/chimeras.txt usearch_checked_combined.017/chimeras.txt usearch_checked_combined.018/chimeras.txt usearch_checked_combined.019/chimeras.txt usearch_checked_combined.020/chimeras.txt usearch_checked_combined.021/chimeras.txt usearch_checked_combined.022/chimeras.txt usearch_checked_combined.023/chimeras.txt usearch_checked_combined.024/chimeras.txt usearch_checked_combined.025/chimeras.txt usearch_checked_combined.026/chimeras.txt usearch_checked_combined.027/chimeras.txt usearch_checked_combined.028/chimeras.txt usearch_checked_combined.029/chimeras.txt usearch_checked_combined.030/chimeras.txt usearch_checked_combined.031/chimeras.txt usearch_checked_combined.032/chimeras.txt usearch_checked_combined.033/chimeras.txt usearch_checked_combined.034/chimeras.txt usearch_checked_combined.035/chimeras.txt usearch_checked_combined.036/chimeras.txt usearch_checked_combined.037/chimeras.txt usearch_checked_combined.038/chimeras.txt usearch_checked_combined.039/chimeras.txt usearch_checked_combined.040/chimeras.txt usearch_checked_combined.041/chimeras.txt usearch_checked_combined.042/chimeras.txt usearch_checked_combined.043/chimeras.txt usearch_checked_combined.044/chimeras.txt usearch_checked_combined.045/chimeras.txt usearch_checked_combined.046/chimeras.txt usearch_checked_combined.047/chimeras.txt usearch_checked_combined.048/chimeras.txt usearch_checked_combined.049/chimeras.txt usearch_checked_combined.050/chimeras.txt usearch_checked_combined.051/chimeras.txt usearch_checked_combined.052/chimeras.txt usearch_checked_combined.053/chimeras.txt usearch_checked_combined.054/chimeras.txt usearch_checked_combined.055/chimeras.txt usearch_checked_combined.056/chimeras.txt usearch_checked_combined.057/chimeras.txt usearch_checked_combined.058/chimeras.txt usearch_checked_combined.059/chimeras.txt usearch_checked_combined.060/chimeras.txt usearch_checked_combined.061/chimeras.txt usearch_checked_combined.062/chimeras.txt usearch_checked_combined.063/chimeras.txt usearch_checked_combined.064/chimeras.txt usearch_checked_combined.065/chimeras.txt usearch_checked_combined.066/chimeras.txt usearch_checked_combined.067/chimeras.txt usearch_checked_combined.068/chimeras.txt usearch_checked_combined.069/chimeras.txt usearch_checked_combined.070/chimeras.txt usearch_checked_combined.071/chimeras.txt usearch_checked_combined.072/chimeras.txt usearch_checked_combined.073/chimeras.txt usearch_checked_combined.074/chimeras.txt usearch_checked_combined.075/chimeras.txt usearch_checked_combined.076/chimeras.txt usearch_checked_combined.077/chimeras.txt usearch_checked_combined.078/chimeras.txt usearch_checked_combined.079/chimeras.txt usearch_checked_combined.080/chimeras.txt usearch_checked_combined.081/chimeras.txt usearch_checked_combined.082/chimeras.txt usearch_checked_combined.083/chimeras.txt usearch_checked_combined.084/chimeras.txt usearch_checked_combined.085/chimeras.txt usearch_checked_combined.086/chimeras.txt usearch_checked_combined.087/chimeras.txt usearch_checked_combined.088/chimeras.txt usearch_checked_combined.089/chimeras.txt usearch_checked_combined.090/chimeras.txt usearch_checked_combined.091/chimeras.txt usearch_checked_combined.092/chimeras.txt usearch_checked_combined.093/chimeras.txt usearch_checked_combined.094/chimeras.txt usearch_checked_combined.095/chimeras.txt usearch_checked_combined.096/chimeras.txt usearch_checked_combined.097/chimeras.txt usearch_checked_combined.098/chimeras.txt usearch_checked_combined.099/chimeras.txt > chimeras.txt
filter_fasta.py -f combined_fasta/combined_seqs.fna -o combined_fasta/combined_nonchimera_seqs.fna -s chimeras.txt -n;
rm -rf usearch_checked_combined.0*
grep ">AH.LH_" combined_nonchimera_seqs.fna | wc -l
...
3.5. create OTU picking parameter file, and run the QIIME open reference picking pipeline
echo "pick_otus:similarity 0.97" > clustering_params.txt
echo "assign_taxonomy:similarity 0.97" >> clustering_params.txt
echo "parallel_align_seqs_pynast:template_fp /home/jhuang/REFs/SILVA_132_QIIME_release/core_alignment/80_core_alignment.fna" >> clustering_params.txt
echo "assign_taxonomy:reference_seqs_fp /home/jhuang/REFs/SILVA_132_QIIME_release/rep_set/rep_set_16S_only/99/silva_132_99_16S.fna" >> clustering_params.txt
echo "assign_taxonomy:id_to_taxonomy_fp /home/jhuang/REFs/SILVA_132_QIIME_release/taxonomy/16S_only/99/consensus_taxonomy_7_levels.txt" >> clustering_params.txt
echo "alpha_diversity:metrics chao1,observed_otus,shannon,PD_whole_tree" >> clustering_params.txt
#with usearch61 for reference picking and usearch61_ref for de novo OTU picking
pick_open_reference_otus.py -r/home/jhuang/REFs/SILVA_132_QIIME_release/rep_set/rep_set_16S_only/99/silva_132_99_16S.fna -i combined_fasta/combined_nonchimera_seqs.fna -o clustering/ -p clustering_params.txt --parallel
3.6. (optional), for control data
summarize_taxa_through_plots.py -i clustering/otu_table_mc2_w_tax_no_pynast_failures.biom -o plots/taxa_summary -s
mv usearch_checked_combined usearch_checked_combined_ctrl
mv combined_fasta combined_fasta_ctrl
mv clustering clustering_ctrl
mv plots plots_ctrl
3.7. for other data: core diversity analyses
core_diversity_analyses.py -o./core_diversity_e33778 -i./clustering/otu_table_mc2_w_tax_no_pynast_failures.biom -m./map2_corrected.txt -t./clustering/rep_set.tre -e33778 -p./clustering_params.txt
4.0. using R-code to summarize all results.
rmarkdown::render('Phyloseq.Rmd',output_file='Phyloseq.html')
点赞本文的读者
还没有人对此文章表态
没有评论
Variant Calling for Herpes Simplex Virus 1 from Patient Sample Using Capture Probe Sequencing
Typing of 81 S. epidermidis samples (Luise)
Co-Authorship Network Generator using scraped data from Google Scholar via SerpAPI
© 2023 XGenes.com Impressum