Epidome processing

gene_x 0 like s 873 view s

Tags: processing, NGS, pipeline

Raw_Data

#Here are some more information on the two sample collections for epidome sequencing: 9+10+33=52

#Samples 7N-15N are nose swabs of 9 individual patients, and 16-20F/N are feet (F) and nose (N) swabs of 5 more patients.

#All of these patients were hospitalized for endoprosthesis surgery.

#nose swaps of 9 individual patients
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1436/7N_S1_R1_001.fastq.gz P7-Nose_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1436/7N_S1_R2_001.fastq.gz P7-Nose_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1437/8N_S2_R1_001.fastq.gz P8-Nose_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1437/8N_S2_R2_001.fastq.gz P8-Nose_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1438/9N_S3_R1_001.fastq.gz P9-Nose_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1438/9N_S3_R2_001.fastq.gz P9-Nose_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1439/10N_S4_R1_001.fastq.gz P10-Nose_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1439/10N_S4_R2_001.fastq.gz P10-Nose_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1440/11N_S5_R1_001.fastq.gz P11-Nose_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1440/11N_S5_R2_001.fastq.gz P11-Nose_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1441/12N_S6_R1_001.fastq.gz P12-Nose_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1441/12N_S6_R2_001.fastq.gz P12-Nose_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1442/13N_S7_R1_001.fastq.gz P13-Nose_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1442/13N_S7_R2_001.fastq.gz P13-Nose_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1443/14N_S8_R1_001.fastq.gz P14-Nose_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1443/14N_S8_R2_001.fastq.gz P14-Nose_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1444/15N_S9_R1_001.fastq.gz P15-Nose_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1444/15N_S9_R2_001.fastq.gz P15-Nose_R2.fastq.gz

#16-20F/N are feet (F) and nose (N) swabs of 5 more patients 10
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1445/16F_S10_R1_001.fastq.gz P16-Foot_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1445/16F_S10_R2_001.fastq.gz P16-Foot_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1446/16N_S11_R1_001.fastq.gz P16-Nose_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1446/16N_S11_R2_001.fastq.gz P16-Nose_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1447/17F_S12_R1_001.fastq.gz P17-Foot_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1447/17F_S12_R2_001.fastq.gz P17-Foot_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1448/17N_S13_R1_001.fastq.gz P17-Nose_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1448/17N_S13_R2_001.fastq.gz P17-Nose_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1449/18F_S14_R1_001.fastq.gz P18-Foot_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1449/18F_S14_R2_001.fastq.gz P18-Foot_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1450/18N_S15_R1_001.fastq.gz P18-Nose_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1450/18N_S15_R2_001.fastq.gz P18-Nose_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1451/19F_S16_R1_001.fastq.gz P19-Foot_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1451/19F_S16_R2_001.fastq.gz P19-Foot_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1452/19N_S17_R1_001.fastq.gz P19-Nose_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1452/19N_S17_R2_001.fastq.gz P19-Nose_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1453/20F_S18_R1_001.fastq.gz P20-Foot_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1453/20F_S18_R2_001.fastq.gz P20-Foot_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1454/20N_S19_R1_001.fastq.gz P20-Nose_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1454/20N_S19_R2_001.fastq.gz P20-Nose_R2.fastq.gz

#Samples 1-108 are swabs of noses, lesioned skin (LH) and non-lesioned skin (NLH) of 11 patients suffering from atopic dermatitis. 33
#There are more details on these samples in the attached excel file.
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1455/1_S20_R1_001.fastq.gz RP-Nose_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1455/1_S20_R2_001.fastq.gz RP-Nose_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1456/2_S21_R1_001.fastq.gz RP-LH_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1456/2_S21_R2_001.fastq.gz RP-LH_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1457/3_S22_R1_001.fastq.gz RP-NLH_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1457/3_S22_R2_001.fastq.gz RP-NLH_R2.fastq.gz

ln -s ./230425_M03701_0301_000000000-KNW3N/hr1458/4_S23_R1_001.fastq.gz AL-Nose_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1458/4_S23_R2_001.fastq.gz AL-Nose_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1459/5_S24_R1_001.fastq.gz AL-LH_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1459/5_S24_R2_001.fastq.gz AL-LH_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1460/6_S25_R1_001.fastq.gz AL-NLH_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1460/6_S25_R2_001.fastq.gz AL-NLH_R2.fastq.gz

ln -s ./230425_M03701_0301_000000000-KNW3N/hr1461/22_S26_R1_001.fastq.gz MC-Nose_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1461/22_S26_R2_001.fastq.gz MC-Nose_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1462/23_S27_R1_001.fastq.gz MC-LH_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1462/23_S27_R2_001.fastq.gz MC-LH_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1463/24_S28_R1_001.fastq.gz MC-NLH_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1463/24_S28_R2_001.fastq.gz MC-NLH_R2.fastq.gz

ln -s ./230425_M03701_0301_000000000-KNW3N/hr1464/25_S29_R1_001.fastq.gz SA-Nose_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1464/25_S29_R2_001.fastq.gz SA-Nose_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1465/26_S30_R1_001.fastq.gz SA-LH_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1465/26_S30_R2_001.fastq.gz SA-LH_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1466/27_S31_R1_001.fastq.gz SA-NLH_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1466/27_S31_R2_001.fastq.gz SA-NLH_R2.fastq.gz

ln -s ./230425_M03701_0301_000000000-KNW3N/hr1467/28_S32_R1_001.fastq.gz HR-Nose_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1467/28_S32_R2_001.fastq.gz HR-Nose_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1468/29_S33_R1_001.fastq.gz HR-LH_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1468/29_S33_R2_001.fastq.gz HR-LH_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1469/30_S34_R1_001.fastq.gz HR-NLH_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1469/30_S34_R2_001.fastq.gz HR-NLH_R2.fastq.gz

ln -s ./230425_M03701_0301_000000000-KNW3N/hr1470/34_S35_R1_001.fastq.gz XN-Nose_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1470/34_S35_R2_001.fastq.gz XN-Nose_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1471/35_S36_R1_001.fastq.gz XN-LH_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1471/35_S36_R2_001.fastq.gz XN-LH_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1472/36_S37_R1_001.fastq.gz XN-NLH_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1472/36_S37_R2_001.fastq.gz XN-NLH_R2.fastq.gz

ln -s ./230425_M03701_0301_000000000-KNW3N/hr1487/AY_S52_R1_001.fastq.gz MR-Nose_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1487/AY_S52_R2_001.fastq.gz MR-Nose_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1473/50_S38_R1_001.fastq.gz MR-LH_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1473/50_S38_R2_001.fastq.gz MR-LH_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1474/51_S39_R1_001.fastq.gz MR-NLH_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1474/51_S39_R2_001.fastq.gz MR-NLH_R2.fastq.gz

ln -s ./230425_M03701_0301_000000000-KNW3N/hr1475/58_S40_R1_001.fastq.gz CB-Nose_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1475/58_S40_R2_001.fastq.gz CB-Nose_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1476/59_S41_R1_001.fastq.gz CB-LH_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1476/59_S41_R2_001.fastq.gz CB-LH_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1477/60_S42_R1_001.fastq.gz CB-NLH_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1477/60_S42_R2_001.fastq.gz CB-NLH_R2.fastq.gz

ln -s ./230425_M03701_0301_000000000-KNW3N/hr1478/94_S43_R1_001.fastq.gz KK-Nose_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1478/94_S43_R2_001.fastq.gz KK-Nose_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1479/95_S44_R1_001.fastq.gz KK-LH_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1479/95_S44_R2_001.fastq.gz KK-LH_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1480/96_S45_R1_001.fastq.gz KK-NLH_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1480/96_S45_R2_001.fastq.gz KK-NLH_R2.fastq.gz

ln -s ./230425_M03701_0301_000000000-KNW3N/hr1481/103_S46_R1_001.fastq.gz AH-Nose_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1481/103_S46_R2_001.fastq.gz AH-Nose_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1482/104_S47_R1_001.fastq.gz AH-LH_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1482/104_S47_R2_001.fastq.gz AH-LH_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1483/105_S48_R1_001.fastq.gz AH-NLH_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1483/105_S48_R2_001.fastq.gz AH-NLH_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1484/106_S49_R1_001.fastq.gz PT2-Nose_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1484/106_S49_R2_001.fastq.gz PT2-Nose_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1485/107_S50_R1_001.fastq.gz PT2-LH_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1485/107_S50_R2_001.fastq.gz PT2-LH_R2.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1486/108_S51_R1_001.fastq.gz PT2-NLH_R1.fastq.gz
ln -s ./230425_M03701_0301_000000000-KNW3N/hr1486/108_S51_R2_001.fastq.gz PT2-NLH_R2.fastq.gz

Processing for Epidome yycH and g216

#-->92-301
#yycH: 476 + 44      -->minLen=200 (76)
#g216: 448 + 44      -->minLen=185 (78)
#16S: 410-427 + 44   -->minLen=170 (70)
#360/2=180 *
#200 and 200 *
#>seq1;
#TGGGTATGGCAATCACTTTACA
#AGAATTCTATATTAAAGATGTTCTAATTGTGGAAAAGGGATCCATCGGTCATTCATTTAAACATTGGCCTCTATCAACAAAGACCATCACACCATCATTTACAACTAATGGTTTTGGCATGCCAGATATGAATGCAATAGCTAAAGATACATCACCTGCCTTCACTTTCAATGAAGAACATTTGTCTGGAAATAATTACGCTCAATACATTTCATTAGTAGCTGAGCATTACAATCTAAATGTCAAAACAAATACCAATGTTTCACGTGTAACATACATAGATGGTATATATCATGTATCAACGGACTATGGTGTTTATACCGCAGATTATATATTTATAGCAACTGGAGACTATTCATTCCCATATCATCCTTTTTCATATGGACGTCATTACAGTGAGATTCGAGCGTTCACTCAATTAAACGGTGACGCCTTTACAATTATTGGA GGTAATGAGAGTGCTTTTGATGC

#>M03701:292:000000000-K9M88:1:1101:10277:1358:AAGAGGCA+TATCCTCT
#TGGGTATGRCAATCACTTTACA
#AGAATTCAATATTAAAGATGTTCTAATTGTTGAAAAGGGAACCATCGGTCATTCATTTAAACATTGGCCTCTATCAACAAAGACCATCACACCATCATTTACAACTAATGGTTTTGGCATGCCAGATATGAATGCAATAGCTAAAGATACATCACCTGCCTTCACTTTCAATGAAGAACATTTATCTGGAAAACGTTATGCTGAATACCTCTCACTAGTAGCTACGCATTACAATCTAAATGGCAAAACAAACACCAATGTTTCACGTGTAACATACATAGATGGTGTATATCATGTATCAACGGACTATGGTGTTTATACCGCAGATTATATATTTATAGCAACTGGAGACTATTCATTCCCATATCATCCTTTATCATATGGACGTCATTACAGTGAAATTCAAACATTCACTCAATTAAAAGGTGATGCTTTTACAATCATTGGT GGTAATGAGAGTGCTTTTGATGC

#DIR: ~/DATA/Data_Holger_Epidome/testrun2
#Input: epidome->/home/jhuang/Tools/epidome and rawdata

#Read in 37158 paired-sequences, output 31225 (84%) filtered paired-sequences.
#Read in 82145 paired-sequences, output 78594 (95.7%) filtered paired-sequences.
#-->
#Overwriting file:/home/jhuang/DATA/Data_Holger_Epidome_myData2/cutadapted_yycH/filtered_R1/A10-1_R1.fastq.gz
#Overwriting file:/home/jhuang/DATA/Data_Holger_Epidome_myData2/cutadapted_yycH/filtered_R2/A10-1_R2.fastq.gz
#Read in 37158 paired-sequences, output 35498 (95.5%) filtered paired-sequences.
#Overwriting file:/home/jhuang/DATA/Data_Holger_Epidome_myData2/cutadapted_yycH/filtered_R1/A10-2_R1.fastq.gz
#Overwriting file:/home/jhuang/DATA/Data_Holger_Epidome_myData2/cutadapted_yycH/filtered_R2/A10-2_R2.fastq.gz
#Read in 82145 paired-sequences, output 80918 (98.5%) filtered paired-sequences.
#
#Read in 46149 paired-sequences, output 22206 (48.1%) filtered paired-sequences.
#Read in 197875 paired-sequences, output 168942 (85.4%) filtered paired-sequences.
#Read in 230646 paired-sequences, output 201376 (87.3%) filtered paired-sequences.
#Read in 175759 paired-sequences, output 149823 (85.2%) filtered paired-sequences.
#Read in 147546 paired-sequences, output 128864 (87.3%) filtered paired-sequences.

2.1. quality controls (optional)

    #under testrun2 should have
    #BiocManager::install("dada2")
    library(dada2); packageVersion("dada2")
    path <- "/home/jhuang/DATA/Data_Luise_Epidome_batch3/raw_data" # CHANGE ME to the directory containing the fastq files after unzipping.
    list.files(path)

    # Forward and reverse fastq filenames have format: SAMPLENAME_R1_001.fastq and SAMPLENAME_R2_001.fastq
    fnFs <- sort(list.files(path, pattern="_R1.fastq.gz", full.names = TRUE))
    fnRs <- sort(list.files(path, pattern="_R2.fastq.gz", full.names = TRUE))
    # Extract sample names, assuming filenames have format: SAMPLENAME_XXX.fastq
    sample.names <- sapply(strsplit(basename(fnFs), "_"), `[`, 1)
    png("quality_fnFs.png", width=800, height=800)
    plotQualityProfile(fnFs[1:2])
    dev.off()
    png("quality_fnRs.png", width=800, height=800)
    plotQualityProfile(fnRs[1:2])
    dev.off()

2.2. cutadapt instead of Trimmomatic (namely demultiplexing, see epidome/scripts/EPIDOME_yycH_cutadapt_loop.sh)

    #Output: cutadapted_yycH cutadapted_g216 cutadapted_16S
    #Script: epidome/scripts/EPIDOME_yycH_cutadapt_loop.sh

    #5′-CGATGCKAAAGTGCCGAATA-3′/5′-CTTCATTTAAGAAGCCACCWTGACT-3′  for yycH
    #5′-TGGGTATGRCAATCACTTTACA-3′/5′-GCATCAAAAGCACTCTCATTACC-3′  for g216
    #-p CCTACGGGNGGCWGCAG -q GACTACHVGGGTATCTAATCC -l 300        for 16S
    mkdir cutadapted_yycH cutadapted_g216 cutadapted_16S
    cd raw_data
    #The default is --action=trim. With --action=retain, the read is trimmed, but the adapter sequence itself is not removed.
    for file in *_R1.fastq.gz; do
    cutadapt -e 0.06 -g CGATGCKAAAGTGCCGAATA -G CTTCATTTAAGAAGCCACCWTGACT --pair-filter=any -o ../cutadapted_yycH/${file} --paired-output ../cutadapted_yycH/${file/R1.fastq.gz/R2.fastq.gz} --discard-untrimmed $file ${file/R1.fastq.gz/R2.fastq.gz};
    done
    for file in *_R1.fastq.gz; do
    cutadapt -e 0.06 -g TGGGTATGRCAATCACTTTACA -G GCATCAAAAGCACTCTCATTACC --pair-filter=any -o ../cutadapted_g216/${file} --paired-output ../cutadapted_g216/${file/R1.fastq.gz/R2.fastq.gz} --discard-untrimmed $file ${file/R1.fastq.gz/R2.fastq.gz};
    done
    for file in *_R1.fastq.gz; do
    cutadapt -e 0.06 -g CCTACGGGNGGCWGCAG -G GACTACHVGGGTATCTAATCC --pair-filter=any -o ../cutadapted_16S/${file} --paired-output ../cutadapted_16S/${file/R1.fastq.gz/R2.fastq.gz} --discard-untrimmed $file ${file/R1.fastq.gz/R2.fastq.gz};
    done

2.3. (IGNORED) regenerate filtered_R1 and filtered_R2 (under conda env qiime1 using pear) --> IGNORED, since we should use filterAndTrim from data2 in the next step!)

    #DEPRECATED: mkdir pandaseq_16S pandaseq_yycH pandaseq_g216
    #DEPRECATED: -p CCTACGGGNGGCWGCAG -q GACTACHVGGGTATCTAATCC 
    #DEPRECATED: for file in cutadapted_16S/*_R1.fastq.gz; do pandaseq -f ${file} -r ${file/_R1.fastq.gz/_R2.fastq.gz} -l 300  -w pandaseq_16S/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_merged.fasta >> LOG_pandaseq_16S; done

    #https://learnmetabarcoding.github.io/LearnMetabarcoding/processing/pair_merging.html#
    #conda install -c conda-forge -c bioconda -c defaults seqkit
    mkdir pear_yycH pear_g216 pear_16S
    for file in cutadapted_yycH/*_R1.fastq.gz; do pear -f ${file} -r ${file/_R1.fastq.gz/_R2.fastq.gz} -j 4 -q 26 -v 10 -o pear_yycH/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1) >> LOG_pear_yycH; done
    for file in cutadapted_g216/*_R1.fastq.gz; do pear -f ${file} -r ${file/_R1.fastq.gz/_R2.fastq.gz} -j 2 -q 26 -v 10 -o pear_g216/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1) >> LOG_pear_g216; done
    for file in cutadapted_16S/*_R1.fastq.gz; do pear -f ${file} -r ${file/_R1.fastq.gz/_R2.fastq.gz} -j 2 -q 26 -v 10 -o pear_16S/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1) >> LOG_pear_16S; done

    for file in cutadapted_yycH/*_R1.fastq.gz; do
    grep "@M0370" pear_yycH/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1).assembled.fastq > cutadapted_yycH/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_IDs.txt
    sed -i -e 's/@//g' cutadapted_yycH/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_IDs.txt
    cut -d' ' -f1 cutadapted_yycH/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_IDs.txt > cutadapted_yycH/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_IDs_.txt
    seqkit grep -f cutadapted_yycH/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_IDs_.txt cutadapted_yycH/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_R1.fastq.gz -o cutadapted_yycH/filtered_R1/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_R1.fastq.gz
    seqkit grep -f cutadapted_yycH/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_IDs_.txt cutadapted_yycH/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_R2.fastq.gz -o cutadapted_yycH/filtered_R2/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_R2.fastq.gz
    done
    #>>LOG_pear_yycH

    for file in cutadapted_g216/*_R1.fastq.gz; do
    grep "@M0370" pear_g216/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1).assembled.fastq > cutadapted_g216/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_IDs.txt
    sed -i -e 's/@//g' cutadapted_g216/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_IDs.txt
    cut -d' ' -f1 cutadapted_g216/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_IDs.txt > cutadapted_g216/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_IDs_.txt
    seqkit grep -f cutadapted_g216/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_IDs_.txt cutadapted_g216/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_R1.fastq.gz -o cutadapted_g216/filtered_R1/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_R1.fastq.gz
    seqkit grep -f cutadapted_g216/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_IDs_.txt cutadapted_g216/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_R2.fastq.gz -o cutadapted_g216/filtered_R2/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_R2.fastq.gz
    done

    for file in cutadapted_16S/*_R1.fastq.gz; do
    grep "@M0370" pear_16S/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1).assembled.fastq > cutadapted_16S/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_IDs.txt
    sed -i -e 's/@//g' cutadapted_16S/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_IDs.txt
    cut -d' ' -f1 cutadapted_16S/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_IDs.txt > cutadapted_16S/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_IDs_.txt
    seqkit grep -f cutadapted_16S/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_IDs_.txt cutadapted_16S/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_R1.fastq.gz -o cutadapted_16S/filtered_R1/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_R1.fastq.gz
    seqkit grep -f cutadapted_16S/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_IDs_.txt cutadapted_16S/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_R2.fastq.gz -o cutadapted_16S/filtered_R2/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_R2.fastq.gz
    done

2.4. filtering+trimming+merging+chimera-removing (VERY_IMPORTANT: under conda env r4-base)

    #Input: cutadapted_yycH, cutadapted_g216
    #Outputs: yycH[g216|16S]_seqtab_from_dada2.rds
    #         yycH[g216|16S]_seqtab_from_dada2.csv
    #         yycH[g216|16S]_seqtab_nochim.rds
    #         yycH[g216|16S]_seqtab_nochim.csv
    #         yycH[g216|16S]_seqtab_image.RData
    #         track_yycH[g216|16S].csv -->
    #The following scripts are modified from epidome/scripts/dada2_for_EPIDOME_yycH_runwise_pipeline.R
    #./my_EPIDOME_g216_runwise_pipeline.R    #minLen=185, with FILTERING, it does not work!
    #./my_EPIDOME_yycH_runwise_pipeline.R    #minLen=200, with FILTERING, it does not work!
    ./my_EPIDOME_g216_runwise_pipeline_.R > g216_runwise_pipeline_.LOG    #minLen=185. NO FILTERING ANY MORE, since the input are filtered_R1 and filtered_R2.
    ./my_EPIDOME_yycH_runwise_pipeline_.R > yycH_runwise_pipeline_.LOG    #minLen=200. NO FILTERING ANY MORE, since the input are filtered_R1 and filtered_R2.
    ./my_EPIDOME_16S_runwise_pipeline.R > 16S_runwise_pipeline.LOG    #minLen=170. IGNORED since the 16S reads will be processed separately as below.
    #END

    #Overwriting file:/media/jhuang/Elements/Data_Luise_Epidome_batch3/cutadapted_yycH/filtered_R1/AH-LH_R1.fastq.gz
    #Overwriting file:/media/jhuang/Elements/Data_Luise_Epidome_batch3/cutadapted_yycH/filtered_R2/AH-LH_R2.fastq.gz
    #Read in 90261 paired-sequences, output 89026 (98.6%) filtered paired-sequences.
    #Overwriting file:/media/jhuang/Elements/Data_Luise_Epidome_batch3/cutadapted_yycH/filtered_R1/AH-NLH_R1.fastq.gz
    #Overwriting file:/media/jhuang/Elements/Data_Luise_Epidome_batch3/cutadapted_yycH/filtered_R2/AH-NLH_R2.fastq.gz
    #Read in 74638 paired-sequences, output 73633 (98.7%) filtered paired-sequences.
    #Overwriting file:/media/jhuang/Elements/Data_Luise_Epidome_batch3/cutadapted_yycH/filtered_R1/AH-Nose_R1.fastq.gz
    #Overwriting file:/media/jhuang/Elements/Data_Luise_Epidome_batch3/cutadapted_yycH/filtered_R2/AH-Nose_R2.fastq.gz
    #Read in 71311 paired-sequences, output 70542 (98.9%) filtered paired-sequences.
    #Overwriting file:/media/jhuang/Elements/Data_Luise_Epidome_batch3/cutadapted_yycH/filtered_R1/AL-LH_R1.fastq.gz
    #Overwriting file:/media/jhuang/Elements/Data_Luise_Epidome_batch3/cutadapted_yycH/filtered_R2/AL-LH_R2.fastq.gz
    #Read in 94807 paired-sequences, output 93082 (98.2%) filtered paired-sequences.

    ~/Tools/csv2xls-0.4/csv_to_xls.py g216_track.csv yycH_track.csv 16S_track.csv -d$';' -o overview.xls;

    # Read the CSV file into a DataFrame
    df <- read.csv("g216_seqtab_from_dada2_nohead.csv", sep=";", row.name=1, header=FALSE)
    #df <- read.csv("g216_seqtab_nochim.csv", sep=";", row.name=1)
    # Calculate the sum for each row
    row_sums <- rowSums(df)
    # Print the row sums
    print(row_sums)

    > print(row_sums)
      AH-LH   AH-NLH  AH-Nose    AL-LH   AL-NLH  AL-Nose    CB-LH   CB-NLH 
      87232    69741    89689    76660    94636   108810    73814    56312 
    CB-Nose    HR-LH   HR-NLH  HR-Nose    KK-LH   KK-NLH  KK-Nose    MC-LH 
      61740    76216    63165    55479    78550    87579    83826    73738 
      MC-NLH  MC-Nose    MR-LH   MR-NLH  MR-Nose P10-Nose P11-Nose P12-Nose 
      100338    94956    88158    63054    82361   103782   108533    90398 
    P13-Nose P14-Nose P15-Nose P16-Foot P16-Nose P17-Foot P17-Nose P18-Foot 
      87059    95656   110207    67058    77606    58339    95407    87775 
    P18-Nose P19-Foot P19-Nose P20-Foot P20-Nose  P7-Nose  P8-Nose  P9-Nose 
      107560    79373    99571   104667   109457   101528    99565   147485 
      PT2-LH  PT2-NLH PT2-Nose    RP-LH   RP-NLH  RP-Nose    SA-LH   SA-NLH 
      81074    89345    77121    68946    71414   113722    53465    40966 
    SA-Nose    XN-LH   XN-NLH  XN-Nose 
      33381    73842    76028    80630

      AH-LH   AH-NLH  AH-Nose    AL-LH   AL-NLH  AL-Nose    CB-LH   CB-NLH 
      79615    52143    57780    70801    89636    99428    52185    54848 
    CB-Nose    HR-LH   HR-NLH  HR-Nose    KK-LH   KK-NLH  KK-Nose    MC-LH 
      50249    58365    45201    38747    56027    65755    53110    72355 
      MC-NLH  MC-Nose    MR-LH   MR-NLH  MR-Nose P10-Nose P11-Nose P12-Nose 
      97363    69881    68599    52386    48364    84634    78491    71877 
    P13-Nose P14-Nose P15-Nose P16-Foot P16-Nose P17-Foot P17-Nose P18-Foot 
      81694    84290   100606    62621    66484    50015    93498    73730 
    P18-Nose P19-Foot P19-Nose P20-Foot P20-Nose  P7-Nose  P8-Nose  P9-Nose 
      93713    63755    70420    81030   105601    94352    70765   124449 
      PT2-LH  PT2-NLH PT2-Nose    RP-LH   RP-NLH  RP-Nose    SA-LH   SA-NLH 
      56869    74951    62339    68726    71145   111279    47754    36180 
    SA-Nose    XN-LH   XN-NLH  XN-Nose 
      26130    55647    69993    50727

    #wc -l cutadapted_yycH/filtered_R1$ vim Extraction-control-2_R1.fastq.gz #-->2696
    #Read in 1138 paired-sequences, output 674 (59.2%) filtered paired-sequences.
    #"Extraction-control-2";0;61;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;591;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 -->600 sequences
    #"Extraction-control-2";0;61;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;591;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0  #-->after chimera-removing, only 107 sequences
    #Processing: Extraction-control-2
    #Sample 1 - 674 reads in 209 unique sequences (What does the unique sequences mean???).  #merged sequences are 209
    #Sample 1 - 674 reads in 280 unique sequences.
    #"";"input_read_count"*; "filtered_and_trimmed_read_count";"merged_after_dada2_read_count"; "non-chimeric_read_count"*
    #"Extraction-control-2"; 1138*; 674;652; 652*

    #biom convert -i table.txt -o table.from_txt_json.biom --table-type="OTU table" --to-json
    #summarize_taxa_through_plots.py -i clustering/otu_table_mc2_w_tax_no_pynast_failures.biom -o plots/taxa_summary -s
    #summarize_taxa.py -i otu_table.biom -o ./tax

    #g216_track.csv
    "A10-1";36009;493;367;367
    "A10-2";175221;82727;30264;27890
    "A10-3";110170;36812;13715;13065
    "A10-4";142323;64398;24306;21628

    #yycH_track.csv
    "A10-1";37158;549;0;0     pandaseq:36791
    "A10-2";82145;23953;6180;5956
    "A10-3";53438;12480;2944;2944
    "A10-4";64516;18361;12350;11797

    #16S_track.csv
    "A10-1";46149(in cutadapted_16S);13(in filtered_R1);8;8
    "A10-2";197875;2218;1540;1391
    "A10-3";230646;2429;1819;1752
    "A10-4";175759;2001;1439;1366

    #zcat A10-1_R1.fastq.gz | echo $((`wc -l`/4))
    #121007 > 37158 + 36009 + 46149 = 119316
    #458392 > 82145 + 175221 + 197875 = 455241

2.5. (IGNORED) stitching and removing chimeras (see ~/DATA/Data_Holger_Epidome/epidome/scripts/Combine_and_Remove_Chimeras_yycH.R)

    #my_Combine_and_Remove_Chimeras_g216.R is a part of my_EPIDOME_yycH_runwise_pipeline.R (see lines 53-55) --> IGNORED!

2.6. Classification: epidome/scripts/ASV_blast_classification.py

    #Input: g216_seqtab_nochim.csv using DATABASE epidome/DB/g216_ref_aln.fasta
    #Output: g216_seqtab_ASV_seqs.fasta, g216_seqtab_ASV_blast.txt and g216_seqtab.csv.classified.csv

    python3 epidome/scripts/ASV_blast_classification.py yycH_seqtab_nochim.csv yycH_seqtab_ASV_seqs.fasta  epidome/DB/yycH_ref_aln.fasta  yycH_seqtab_ASV_blast.txt yycH_seqtab.csv.classified.csv 99.5
    python3 epidome/scripts/ASV_blast_classification.py g216_seqtab_nochim.csv g216_seqtab_ASV_seqs.fasta  epidome/DB/g216_ref_aln.fasta  g216_seqtab_ASV_blast.txt g216_seqtab.csv.classified.csv 99.5

    #old: python3 epidome/scripts/ASV_blast_classification.py   yycH_seqtab.csv yycH_seqtab.csv.ASV_seqs.fasta  epidome/DB/yycH_ref_aln.fasta yycH_seqtab.csv.ASV_blast.txt yycH_seqtab.csv.classified.csv 99.5
    #old: python3 epidome/scripts/ASV_blast_classification_combined.py -p1 190920_run1_yycH_seqtab_from_dada2.csv -p2 190920_run1_G216_seqtab_from_dada2.csv -p1_ref epidome/DB/yycH_ref_aln.fasta -p2_ref epidome/DB/g216_ref_aln.fasta

    ##rename "seqseq2" --> seq2
    #sed -i -e s/seq//g 190920_run1_yycH_seqtab_from_dada2.csv.ASV_blast.txt
    #sed -i -e s/seqseq/seq/g 190920_run1_yycH_seqtab_from_dada2.csv.classified.csv
    #diff 190920_run1_yycH_seqtab_from_dada2.csv.ASV_seqs.fasta epidome/example_data/190920_run1_yycH_seqtab_from_dada2.csv.ASV_seqs.fasta
    #diff 190920_run1_yycH_seqtab_from_dada2.csv.ASV_blast.txt epidome/example_data/190920_run1_yycH_seqtab_from_dada2.csv.ASV_blast.txt
    #diff 190920_run1_yycH_seqtab_from_dada2.csv.classified.csv epidome/example_data/190920_run1_yycH_seqtab_from_dada2.csv.classified.csv
    ## WHY: 667 seqs in old calculation, but in our calculation only 108 seqs
    ## They took *_seqtab_from_dada2.csv, but we took *_seqtab_nochim.csv. (653 vs 108 records!)
    ##AAAT";"seq37,36";0;
    sed -i -e s/seq//g yycH_seqtab_ASV_blast.txt   #length=476
    sed -i -e s/seq//g g216_seqtab_ASV_blast.txt   #length=448
    #;-->""
    sed -i -e s/';'//g yycH_seqtab_ASV_blast.txt
    sed -i -e s/';'//g g216_seqtab_ASV_blast.txt
    sed -i -e s/seqseq/seq/g yycH_seqtab.csv.classified.csv
    sed -i -e s/seqseq/seq/g g216_seqtab.csv.classified.csv
    #;,seq --> ,seq
    #;"; --> ";
    sed -i -e s/";,seq"/",seq"/g yycH_seqtab.csv.classified.csv
    sed -i -e s/";,seq"/",seq"/g g216_seqtab.csv.classified.csv
    sed -i -e s/";\";"/"\";"/g yycH_seqtab.csv.classified.csv
    sed -i -e s/";\";"/"\";"/g g216_seqtab.csv.classified.csv

    #"ASV";"Seq_number";"even-mock3-1_S258_L001";"even-mock3-2_S282_L001";"even-mock3-3_S199_L001";"staggered-mock3-1_S270_L001";"staggered-mock3-2_S211_L001";"staggered-mock3-3_S223_L001"
    #"ASV";"Seq_number";"Extraction_control_1";"Extraction_control_2";"P01_nose_1";"P01_nose_2";"P01_skin_1";"P01_skin_2";"P02_nose_1";"P02_nose_2";"P02_skin_1";"P02_skin_2";"P03_nose_1";"P03_nose_2";"P03_skin_1";"P03_skin_2";"P04_nose_1";"P04_nose_2";"P04_skin_1";"P04_skin_2";"P05_nose_1";"P05_nose_2";"P05_skin_1";"P05_skin_2";"P06_nose_1";"P06_nose_2";"P06_skin_1";"P06_skin_2";"P07_nose_1";"P07_nose_2";"P07_skin_1";"P07_skin_2";"P08_nose_1";"P08_nose_2";"P08_skin_1";"P08_skin_2";"P09_nose_1";"P09_nose_2";"P09_skin_1";"P09_skin_2";"P10_nose_1";"P10_nose_2";"P10_skin_1";"P10_skin_2";"P11_nose_1";"P11_nose_2";"P11_skin_1";"P11_skin_2";"even-mock3-1_S258_L001";"even-mock3-2_S282_L001";"even-mock3-3_S199_L001";"staggered-mock3-1_S270_L001";"staggered-mock3-2_S211_L001";"staggered-mock3-3_S223_L001"

    grep -v ";NA;" g216_seqtab.csv.classified.csv > g216_seqtab.csv.classified_noNA.csv
    grep -v ";NA;" yycH_seqtab.csv.classified.csv > yycH_seqtab.csv.classified_noNA.csv

    https://github.com/ssi-dk/epidome/blob/master/example_data/190920_run1_G216_seqtab_from_dada2.csv.classified.csv
    #DEBUG using LibreOffice, e.g. libreoffice --calc yycH_seqtab.csv.classified_noNA.csv after adding "ID"; at the corner.
    seq24,seq21 --> seq24,21

    #(OPTIONAL) TO reduce the unclassified, rename seq31,30 --> seq in g216,  seq37,36 --> seq in yycH.

2.7. draw plot from three amplicons: cutadapted_g216, cutadapted_yycH, and cutadapted_16S

    #Taxonomic database setup and classification
    #- Custom databases of all unique g216 and yycH target sequences can be found at https://github.com/ssi-dk/epidome/tree/master/DB. 
    #- We formatted our g216 and yycH gene databases to be compatible with DADA2’s assign-Taxonomy function and used it to classify the S. epidermidis ASVs with the RDP naive Bayesian classifier method (https://github.com/ssi-dk/epidome/tree/master/scripts).
    #- ST classification of samples was performed using the g216 target sequence as the primary identifier. 
    #- All g216 sequences unique to a single clonal cluster in the database were immediately classified as the matching clone, and in cases were the g216 sequence matched multiple clones, the secondary yycH target sequences were parsed to determine which clone was present. When this classification failed to resolve due to multiple potential combinations of sequences, ASVs were categorized as “Unclassified”. Similarly, g216 sequences not found in the database were labelled as “Novel”.

    #~/Tools/csv2xls-0.4/csv_to_xls.py g216_seqtab.csv.classified_noNA.csv yycH_seqtab.csv.classified_noNA.csv -d$'\t' -o counts.xls

    #under r4-base
    source("epidome/scripts/epidome_functions.R")

    ST_amplicon_table = read.table("epidome/DB/epidome_ST_amplicon_frequencies.txt",sep = "\t")
    #"ST"    "Group" "epi01_ASV"     "epi02_ASV"     "freq"
    #"8888"  "324"   113     1       1       2
    #"815097"        "5"     48      40      2       55
    #"846906"        "225"   61      3       3       1
    #"846960"        "-"     62      3       3       1
    #"847064"        "225"   63      3       3       2
    #"847222"        "225"   65      3       3       3
    #"865555"        "278"   37      5       3       4

    epi01_table = read.table("g216_seqtab.csv.classified_noNA.csv",sep = "\t",header=TRUE,row.names=1)
    epi02_table = read.table("yycH_seqtab.csv.classified_noNA.csv",sep = "\t",header=TRUE,row.names=1)
    #> sum(epi01_table$AH.LH)
    #[1] 78872
    #> sum(epi02_table$AH.LH)
    #[1] 86949

    #construct metadata.txt as follows.
    #"sample.ID"     "patient.ID"    "sample.site"   "sample.type"   "patient.sample.site"
    #P7.Nose P7      Nose    swab    P7.Nose.swab
    #P8.Nose P8      Nose    swab    P8.Nose.swab
    #P9.Nose P9      Nose    swab    P9.Nose.swab
    #P10.Nose        P10     Nose    swab    P10.Nose.swab
    #P11.Nose        P11     Nose    swab    P11.Nose.swab
    #P12.Nose        P12     Nose    swab    P12.Nose.swab
    #P13.Nose        P13     Nose    swab    P13.Nose.swab

    metadata_table = read.table("metadata.txt",header=TRUE,row.names=1)
    metadata_table$patient.ID <- factor(metadata_table$patient.ID, levels=c("P7","P8","P9","P10","P11","P12","P13","P14","P15","P16","P17","P18","P19","P20", "AH","AL","CB","HR","KK",  "MC","MR","PT2","RP","SA","XN"))
    epidome_object = setup_epidome_object(epi01_table,epi02_table,metadata_table = metadata_table)

    #Image1
    primer_compare = compare_primer_output(epidome_object,color_variable = "sample.type")
    png("image1.png")
    primer_compare$plot
    dev.off()

    eo_ASV_combined = combine_ASVs_epidome(epidome_object)
    eo_filtered = filter_lowcount_samples_epidome(eo_ASV_combined,500,500)

    count_table = classify_epidome(eo_ASV_combined,ST_amplicon_table)
    #count_df_ordered = count_table[order(rowSums(count_table),decreasing = T),]

    #install.packages("pls")
    #library(pls)
    #install.packages("reshape")
    #library(reshape)
    #install.packages("vegan")
    library('vegan') 
    library(scales)
    library(RColorBrewer)

    #Image2
    #TODO: find out what are the combination 21006 in Aachen?

    source("epidome/scripts/epidome_functions.R")
    #count_table = count_table[-29,]
    #row.names(count_table) <- c("-", "ST297", "ST170", "ST73", "ST225", "ST673", "ST215", "ST19", "Unclassified")
    #row.names(count_table) <- c("NA", "-", "X297", "X170", "X73", "X225", "X673", "X215", "X19", "Unclassified")
    row.names(count_table) <- c("ST215","ST130","ST278","ST200","ST5","ST59","ST83","ST114","ST297","ST384","ST14","ST89","ST210","-","ST328","ST331","ST73","ST2","ST88","ST100","ST10","ST290","ST87","ST23","ST218","ST329","ST19","ST225","ST170","Unclassified")
    #colnames(count_table) <- c("A2.1","A2.2","A2.3","A3.1","A3.2","A3.3","A4.1","A4.2","A4.3","A4.4","A5.1","A5.2","A5.3","A5.4","A5.5","A5.6","A5.7","A10.1","A10.2","A10.3","A10.4","A17.1","A17.2","A17.3","A21.1","A21.2","A21.3","A22.1","A22.2","A22.3","A24.1","A24.2","A24.3","A25.1","A25.2","A25.3","A27.1","A27.2","A27.3","A28.1","A28.2","A28.3","LM.Nose","LM.Foot","LZ.Foot","LZ.Nose","NG.Foot","NG.Nose","VK.Foot","VK.Nose","AK.Foot","AK.Nose","MS.Foot","MS.Nose","AH.Nose","AH.Foot","AY_Nose","AY.Foot","JS.Nose","JS.Foot","PC.Nose","PC.Foot","SB.Nose","SB.Foot")
    #col_order <- c("A2.1","A2.2","A2.3","A3.1","A3.2","A3.3","A4.1","A4.2","A4.3","A4.4","A5.1","A5.2","A5.3","A5.4","A5.5","A5.6","A5.7","A10.1","A10.2","A10.3","A10.4","A17.1","A17.2","A17.3","A21.1","A21.2","A21.3","A22.1","A22.2","A22.3","A24.1","A24.2","A24.3","A25.1","A25.2","A25.3","A27.1","A27.2","A27.3","A28.1","A28.2","A28.3", "LM.Nose","LM.Foot", "LZ.Nose","LZ.Foot", "NG.Nose","NG.Foot", "VK.Nose","VK.Foot", "AK.Nose","AK.Foot", "MS.Nose","MS.Foot", "AH.Nose","AH.Foot", "AY_Nose","AY.Foot", "JS.Nose","JS.Foot", "PC.Nose","PC.Foot", "SB.Nose","SB.Foot")
    #count_table_reordered <- count_table[,col_order]
    count_table_reordered <- count_table

    write.csv(file="count_table.txt", count_table_reordered)
    #NOTE to change rowname from '-' to 'Novel'

    p = make_barplot_epidome(count_table_reordered,reorder=FALSE,normalize=TRUE)
    #p = make_barplot_epidome(count_table_reordered,reorder=TRUE,normalize=TRUE)
    png("Barplot_All.png", width=1600, height=900)
    p
    dev.off()

    #Image3
    eo_clinical = prune_by_variable_epidome(epidome_object,"sample.type",c("swab"))
    #eo_Aachen = prune_by_variable_epidome(epidome_object,"sample.type",c("Aachen"))

    epidome_object_clinical_norm = normalize_epidome_object(eo_clinical) ### Normalize counts to percent
    #epidome_object_Aachen_norm = normalize_epidome_object(eo_Aachen)

    png("PCA_by_patientID.png", width=1200, height=800)
    PCA_patient_colored = plot_PCA_epidome(eo_filtered,color_variable = "patient.ID",colors=c(), plot_ellipse = FALSE)
    PCA_patient_colored + ggtitle("PCA plot of all samples colored by patient ID")
    dev.off()
    png("PCA_Clinical_by_patientID.png", width=1200, height=800)
    PCA_patient_colored = plot_PCA_epidome(epidome_object_clinical_norm,color_variable = "patient.ID",colors=c(), plot_ellipse = FALSE)
    PCA_patient_colored + ggtitle("PCA plot of clinical samples colored by patient ID")
    dev.off()
    #png("PCA_Aachen_by_patientID.png", width=1200, height=800)
    #PCA_sample_site_colored = plot_PCA_epidome(epidome_object_Aachen_norm,color_variable = "patient.ID",colors=c(), plot_ellipse = FALSE)
    #PCA_sample_site_colored + ggtitle("PCA plot of nose and foot samples colored by patient ID")
    #dev.off()
    #png("PCA_Aachen_by_sampleSite.png", width=1200, height=800)
    #PCA_sample_site_colored = plot_PCA_epidome(epidome_object_Aachen_norm,color_variable = "sample.site",colors = c("Red","Blue"),plot_ellipse = FALSE)
    #PCA_sample_site_colored + ggtitle("PCA plot of nose and foot samples colored by sampling site")
    #dev.off()

    #Image4
    eo_filter_lowcount = filter_lowcount_samples_epidome(epidome_object,p1_threshold = 500,p2_threshold = 500)
    #-->[1] "1 low count samples removed from data: RP.Nose"
    eo_filter_ASVs = epidome_filtered_ASVs = filter_lowcount_ASVs_epidome(epidome_object,percent_threshold = 1)
    epidome_object_normalized = normalize_epidome_object(epidome_object)
    epidome_object_ASV_combined = combine_ASVs_epidome(epidome_object)
    epidome_object_clinical = prune_by_variable_epidome(epidome_object,variable_name = "sample.type",variable_values = c("swab"))
    #epidome_object_Aachen= prune_by_variable_epidome(epidome_object,variable_name = "sample.type",variable_values = c("Aachen"))

    eo_ASV_combined = combine_ASVs_epidome(epidome_object_clinical)
    count_table_reordered = classify_epidome(eo_ASV_combined,ST_amplicon_table)
    p = make_barplot_epidome(count_table_reordered,reorder=FALSE,normalize=TRUE)
    png("Barplot_Clinical.png", width=1200, height=800)
    p
    dev.off()

    #eo_ASV_combined = combine_ASVs_epidome(epidome_object_Aachen)
    #count_table_reordered = classify_epidome(eo_ASV_combined,ST_amplicon_table)
    #colnames(count_table_reordered) <- c("LM.Nose","LM.Foot","LZ.Nose","LZ.Foot","NG.Nose","NG.Foot","VK.Nose","VK.Foot","AK.Nose","AK.Foot","MS.Nose","MS.Foot","AH.Nose","AH.Foot","AY_Nose","AY.Foot","JS.Nose","JS.Foot","PC.Nose","PC.Foot","SB.Nose","SB.Foot")
    #p = make_barplot_epidome(count_table_reordered,reorder=FALSE,normalize=TRUE)
    #png("Barplot_Aachen.png", width=1200, height=800)
    #p
    #dev.off()

3.0. The methods for 16S

3.1. stitch

    mkdir pandaseq.out
    #-p CCTACGGGNGGCWGCAG -q GACTACHVGGGTATCTAATCC
    for file in cutadapted_16S/filtered_R1/*_R1.fastq.gz; do echo "pandaseq -f ${file} -r ${file/_R1/_R2} -l 300 -w pandaseq.out/$(echo $file | cut -d'/' -f3 | cut -d'_' -f1)_merged.fasta >> LOG_pandaseq"; done

    pandaseq -f cutadapted_16S/filtered_R1/AH-LH_R1.fastq.gz -r cutadapted_16S/filtered_R2/AH-LH_R2.fastq.gz -l 300 -w pandaseq.out/AH-LH_merged.fasta >> LOG_pandaseq
    ...

    grep ">" AH-LH_merged.fasta | wc -l
    ...

    jhuang@hamburg:~/DATA/Data_Luise_Epidome_batch3/core_diversity_e33778$ grep "AH.LH" biom_table_summary.txt
    AH.LH: 75.566,000
    ...

3.2. create two QIIME mapping files

    validate_mapping_file.py -m map2.txt

3.3. combine files into a labeled file

    add_qiime_labels.py -i pandaseq.out -m map2_corrected.txt -c FileInput -o combined_fasta

3.4. remove chimeric sequences using usearch

    cd combined_fasta
    pyfasta split -n 100 combined_seqs.fna
    for i in {000..099}; do echo "identify_chimeric_seqs.py -i combined_fasta/combined_seqs.fna.${i} -m usearch61 -o usearch_checked_combined.${i}/ -r ~/REFs/gg_97_otus_4feb2011_fw_rc.fasta --threads=14;" >> uchime_commands.sh; done
    mv uchime_commands.sh ..
    ./uchime_commands.sh

    cat usearch_checked_combined.000/chimeras.txt usearch_checked_combined.001/chimeras.txt usearch_checked_combined.002/chimeras.txt usearch_checked_combined.003/chimeras.txt usearch_checked_combined.004/chimeras.txt usearch_checked_combined.005/chimeras.txt usearch_checked_combined.006/chimeras.txt usearch_checked_combined.007/chimeras.txt usearch_checked_combined.008/chimeras.txt usearch_checked_combined.009/chimeras.txt usearch_checked_combined.010/chimeras.txt usearch_checked_combined.011/chimeras.txt usearch_checked_combined.012/chimeras.txt usearch_checked_combined.013/chimeras.txt usearch_checked_combined.014/chimeras.txt usearch_checked_combined.015/chimeras.txt usearch_checked_combined.016/chimeras.txt usearch_checked_combined.017/chimeras.txt usearch_checked_combined.018/chimeras.txt usearch_checked_combined.019/chimeras.txt usearch_checked_combined.020/chimeras.txt usearch_checked_combined.021/chimeras.txt usearch_checked_combined.022/chimeras.txt usearch_checked_combined.023/chimeras.txt usearch_checked_combined.024/chimeras.txt usearch_checked_combined.025/chimeras.txt usearch_checked_combined.026/chimeras.txt usearch_checked_combined.027/chimeras.txt usearch_checked_combined.028/chimeras.txt usearch_checked_combined.029/chimeras.txt usearch_checked_combined.030/chimeras.txt usearch_checked_combined.031/chimeras.txt usearch_checked_combined.032/chimeras.txt usearch_checked_combined.033/chimeras.txt usearch_checked_combined.034/chimeras.txt usearch_checked_combined.035/chimeras.txt usearch_checked_combined.036/chimeras.txt usearch_checked_combined.037/chimeras.txt usearch_checked_combined.038/chimeras.txt usearch_checked_combined.039/chimeras.txt usearch_checked_combined.040/chimeras.txt usearch_checked_combined.041/chimeras.txt usearch_checked_combined.042/chimeras.txt usearch_checked_combined.043/chimeras.txt usearch_checked_combined.044/chimeras.txt usearch_checked_combined.045/chimeras.txt usearch_checked_combined.046/chimeras.txt usearch_checked_combined.047/chimeras.txt usearch_checked_combined.048/chimeras.txt usearch_checked_combined.049/chimeras.txt usearch_checked_combined.050/chimeras.txt usearch_checked_combined.051/chimeras.txt usearch_checked_combined.052/chimeras.txt usearch_checked_combined.053/chimeras.txt usearch_checked_combined.054/chimeras.txt usearch_checked_combined.055/chimeras.txt usearch_checked_combined.056/chimeras.txt usearch_checked_combined.057/chimeras.txt usearch_checked_combined.058/chimeras.txt usearch_checked_combined.059/chimeras.txt usearch_checked_combined.060/chimeras.txt usearch_checked_combined.061/chimeras.txt usearch_checked_combined.062/chimeras.txt usearch_checked_combined.063/chimeras.txt usearch_checked_combined.064/chimeras.txt usearch_checked_combined.065/chimeras.txt usearch_checked_combined.066/chimeras.txt usearch_checked_combined.067/chimeras.txt usearch_checked_combined.068/chimeras.txt usearch_checked_combined.069/chimeras.txt usearch_checked_combined.070/chimeras.txt usearch_checked_combined.071/chimeras.txt usearch_checked_combined.072/chimeras.txt usearch_checked_combined.073/chimeras.txt usearch_checked_combined.074/chimeras.txt usearch_checked_combined.075/chimeras.txt usearch_checked_combined.076/chimeras.txt usearch_checked_combined.077/chimeras.txt usearch_checked_combined.078/chimeras.txt usearch_checked_combined.079/chimeras.txt usearch_checked_combined.080/chimeras.txt usearch_checked_combined.081/chimeras.txt usearch_checked_combined.082/chimeras.txt usearch_checked_combined.083/chimeras.txt usearch_checked_combined.084/chimeras.txt usearch_checked_combined.085/chimeras.txt usearch_checked_combined.086/chimeras.txt usearch_checked_combined.087/chimeras.txt usearch_checked_combined.088/chimeras.txt usearch_checked_combined.089/chimeras.txt usearch_checked_combined.090/chimeras.txt usearch_checked_combined.091/chimeras.txt usearch_checked_combined.092/chimeras.txt usearch_checked_combined.093/chimeras.txt usearch_checked_combined.094/chimeras.txt usearch_checked_combined.095/chimeras.txt usearch_checked_combined.096/chimeras.txt usearch_checked_combined.097/chimeras.txt usearch_checked_combined.098/chimeras.txt usearch_checked_combined.099/chimeras.txt > chimeras.txt
    filter_fasta.py -f combined_fasta/combined_seqs.fna -o combined_fasta/combined_nonchimera_seqs.fna -s chimeras.txt -n;
    rm -rf usearch_checked_combined.0*

    grep ">AH.LH_" combined_nonchimera_seqs.fna | wc -l
    ...

3.5. create OTU picking parameter file, and run the QIIME open reference picking pipeline

    echo "pick_otus:similarity 0.97" > clustering_params.txt
    echo "assign_taxonomy:similarity 0.97" >> clustering_params.txt
    echo "parallel_align_seqs_pynast:template_fp /home/jhuang/REFs/SILVA_132_QIIME_release/core_alignment/80_core_alignment.fna" >> clustering_params.txt
    echo "assign_taxonomy:reference_seqs_fp /home/jhuang/REFs/SILVA_132_QIIME_release/rep_set/rep_set_16S_only/99/silva_132_99_16S.fna" >> clustering_params.txt
    echo "assign_taxonomy:id_to_taxonomy_fp /home/jhuang/REFs/SILVA_132_QIIME_release/taxonomy/16S_only/99/consensus_taxonomy_7_levels.txt" >> clustering_params.txt
    echo "alpha_diversity:metrics chao1,observed_otus,shannon,PD_whole_tree" >> clustering_params.txt
    #with usearch61 for reference picking and usearch61_ref for de novo OTU picking
    pick_open_reference_otus.py -r/home/jhuang/REFs/SILVA_132_QIIME_release/rep_set/rep_set_16S_only/99/silva_132_99_16S.fna -i combined_fasta/combined_nonchimera_seqs.fna -o clustering/ -p clustering_params.txt --parallel

3.6. (optional), for control data

    summarize_taxa_through_plots.py -i clustering/otu_table_mc2_w_tax_no_pynast_failures.biom -o plots/taxa_summary -s
    mv usearch_checked_combined usearch_checked_combined_ctrl
    mv combined_fasta combined_fasta_ctrl
    mv clustering clustering_ctrl
    mv plots plots_ctrl

3.7. for other data: core diversity analyses

    core_diversity_analyses.py -o./core_diversity_e33778 -i./clustering/otu_table_mc2_w_tax_no_pynast_failures.biom -m./map2_corrected.txt -t./clustering/rep_set.tre -e33778 -p./clustering_params.txt

4.0. using R-code to summarize all results.

rmarkdown::render('Phyloseq.Rmd',output_file='Phyloseq.html')

Phyloseq.Rmd

Phyloseq.html

like unlike

点赞本文的读者

还没有人对此文章表态

本文有评论

没有评论

Epidome processing

本文有评论

看文章，发评论，不要沉默

最受欢迎文章

最新文章

最多评论文章

推荐相似文章