From epidome to treeHeatmap

gene_x 0 like s 643 view s

Tags: plot, python, R, pipeline

  1. (namely epidome-processing step 2.6) Classification using 98% similarity.

    #Input Files: yycH_seqtab_nochim.csv and ../epidome/DB/yycH_ref_aln.fasta
    #Output Files: yycH_seqtab_ASV_seqs.fasta, yycH_seqtab_ASV_blast.txt, and yycH_seqtab.csv.classified.csv
    
    python3 ../epidome/scripts/ASV_blast_classification.py yycH_seqtab_nochim.csv yycH_seqtab_ASV_seqs.fasta ../epidome/DB/yycH_ref_aln.fasta yycH_seqtab_ASV_blast.txt yycH_seqtab.csv.classified.csv 98
    python3 ../epidome/scripts/ASV_blast_classification.py g216_seqtab_nochim.csv g216_seqtab_ASV_seqs.fasta ../epidome/DB/g216_ref_aln.fasta g216_seqtab_ASV_blast.txt g216_seqtab.csv.classified.csv 98
    
  2. Manually check the reults, ensuring no seq34;seq35 occurs.

    #Delete all records in *_seqtab.csv.classified.csv the alignment length < 448 (delete records 110 and 111) in g216, or < 440 (del record 90) in yycH.
    
    #Confirm the results
    jhuang@hamburg:/media/jhuang/Elements/Data_Luise_Epidome_batch3/run_2023_98$ cut -d$'\t' -f4 g216_seqtab_ASV_blast.txt | sort -u
    446
    447
    448
    jhuang@hamburg:/media/jhuang/Elements/Data_Luise_Epidome_batch3/run_2023_98$ cut -d$'\t' -f4 yycH_seqtab_ASV_blast.txt | sort -u
    475
    476
    
    #It is not neccesary to delete correponding records 90 in yycH_* and g216_seqtab_ASV_blast.txt, since the two files are not used. 
    #sed -i -e s/seq//g yycH_seqtab_ASV_blast.txt   #length=476
    #sed -i -e s/seq//g g216_seqtab_ASV_blast.txt   #length=448
    ##;-->""
    #sed -i -e s/';'//g yycH_seqtab_ASV_blast.txt
    #sed -i -e s/';'//g g216_seqtab_ASV_blast.txt
    
    cp g216_seqtab.csv.classified.csv g216_seqtab.csv.classified.backup
    cp yycH_seqtab.csv.classified.csv yycH_seqtab.csv.classified.backup
    
    # (Option 1): Processing *.classified.csv, ensuring the format seq24,21 or seq24.
    #https://github.com/ssi-dk/epidome/blob/master/example_data/190920_run1_G216_seqtab_from_dada2.csv.classified.csv
    ##DEBUG using LibreOffice, e.g. libreoffice --calc yycH_seqtab.csv.classified_noNA.csv after adding "ID"; at the corner.
    #Final goal: "seqseq31;,seq30;" --> "seq31,30"
    #;,seq --> ,
    sed -i -e s/";,seq"/","/g g216_seqtab.csv.classified.csv
    sed -i -e s/";,seq"/","/g yycH_seqtab.csv.classified.csv
    #;"; --> ";
    sed -i -e s/";\";"/"\";"/g g216_seqtab.csv.classified.csv
    sed -i -e s/";\";"/"\";"/g yycH_seqtab.csv.classified.csv
    sed -i -e s/seqseq/seq/g g216_seqtab.csv.classified.csv
    sed -i -e s/seqseq/seq/g yycH_seqtab.csv.classified.csv
    #confirm!
    cut -d';' -f3 g216_seqtab.csv.classified.csv
    cut -d';' -f3 yycH_seqtab.csv.classified.csv
    
    # (Option 2): To reduce the unclassified, rename seq31,seq30 --> seq31 in g216,  seq37,seq36 --> seq36 in yycH.
    sed -i -e s/"seq40,seq31,seq30"/"seq40"/g g216_seqtab.csv.classified.csv
    sed -i -e s/"seq31,seq30"/"seq31"/g g216_seqtab.csv.classified.csv
    sed -i -e s/"seq5,seq2"/"seq2"/g        sed -i -e s/"seq40,seq31,seq30"/"seq40"/g g216_seqtab.csv.classified.csv
    sed -i -e s/"seq31,seq30"/"seq31"/g g216_seqtab.csv.classified.csv
    sed -i -e s/"seq5,seq2"/"seq2"/g g216_seqtab.csv.classified.csv
    grep ",seq" g216_seqtab.csv.classified.csv  #should return no record.
    #sed -i -e s/"seq22,20"/"seq20"/g g216_seqtab.csv.classified.csv
    #sed -i -e s/"seq24,21"/"seq21"/g g216_seqtab.csv.classified.csv
    #sed -i -e s/"seq21,17"/"seq21"/g g216_seqtab.csv.classified.csv
    > epidome_object$p1_seqs    #vim g216_seqtab.csv.classified_noNA.csv
      [1] "seq28"       "seq5"        "seq40"       "seq31,30"(seq31)     "seq14"      
      [6] "seq20"       "seq22"       "seq26"       "seq37"       "seq21"      
    [11] "seq29"       "seq24"       "seq1"        "seq26"       "seq40"      
    [16] "seq21"       "seq18"       "seq40"       "seq16"       "seq37"      
    [21] "seq5,2"      "seq18"       "seq11"       "seq14"       "seq26"      
    [26] "seq31,30"(seq31)     "seq37"       "seq26"       "seq1"        "seq21"      
    [31] "seq3"        "seq40"       "seq26"       "seq28"       "seq40"      
    [36] "seq10"       "seq5"        "seq40,31,30"(seq40)  "seq37"       "seq37"      
    [41] "seq37"       "seq40"       "seq28"       "seq26"       "seq37"      
    [46] "seq22"       "seq28"       "seq19"       "seq5,2"      "seq26"      
    [51] "seq40"       "seq40"       "seq26"       "seq40"       "seq28"      
    [56] "seq28"       "seq28"       "seq37"       "seq26"       "seq40"      
    [61] "seq26"       "seq37"       "seq28"       "seq28"       "seq1"       
    [66] "seq28"       "seq5,2"(seq5)      "seq40"       "seq21"       "seq28"      
    [71] "seq29"       
    #"seq37"       "seq22"       "seq22,20"(seq20)    "seq31,30"(seq31)    
    [76] "seq22"       "seq20"       "seq28"       "seq37"       "seq14"      
    [81] "seq28"       "seq20"       "seq40"       "seq28"       "seq24,21" 
    [86] "seq40"       "seq14"       "seq31,30"(seq31)      "seq1"        "seq1"       
    [91] "seq31,30"(seq31)     "seq1"        "seq40"       "seq20"       "seq1"       
    [96] "seq21"       "seq26"       "seq20"       "seq20"       "seq21,17"   
    [101] "seq37"       "seq26"       "seq31,30"(seq31)      "seq31,30"(seq31)    "seq40"      
    [106] "seq5"        "seq12"
    
    sed -i -e s/"seq37,seq36"/"seq36"/g yycH_seqtab.csv.classified.csv
    sed -i -e s/"seq58,seq25,seq21,seq10"/"seq21"/g yycH_seqtab.csv.classified.csv
    sed -i -e s/"seq23,seq22"/"seq23"/g yycH_seqtab.csv.classified.csv
    sed -i -e s/"seq65,seq63,seq61,seq60"/"seq63"/g yycH_seqtab.csv.classified.csv
    sed -i -e s/"seq65,seq63,seq53"/"seq63"/g yycH_seqtab.csv.classified.csv
    sed -i -e s/"seq61,seq55"/"seq55"/g yycH_seqtab.csv.classified.csv
    sed -i -e s/"seq34,seq28,seq25,seq10,seq9"/"seq34"/g yycH_seqtab.csv.classified.csv
    sed -i -e s/"seq25,seq10,seq9"/"seq9"/g yycH_seqtab.csv.classified.csv
    sed -i -e s/"seq25,seq10"/"seq10"/g yycH_seqtab.csv.classified.csv
    grep ",seq" yycH_seqtab.csv.classified.csv  #should return no record.
    > epidome_object$p2_seqs
    [1] "34"            "43"            "seq37,seq36"-->36         "42"           
    [5] "26"            "63"            "3"             "2"            
    [9] "11"            "seq65,seq63,seq53"-->63      "14"            "38"           
    [13] "55"            "3"             "65"            "seq58,seq25,seq21,seq10"-->21  
    [17] "seq23,seq22"-->23         "53"            "56"            "34"           
    [21] "62"            "14"            "35"            "seq61,seq55"-->55        
    [25] "52"            "seq25,seq10"-->10         "seq65,seq63,seq61,seq60"-->63   "21"           
    [29] "55"            "15"            "34"            "49"           
    [33] "19"            "63"            "63"            "34"           
    [37] "43"            "12"            "34"            "seq34,seq28,seq25,seq10,seq9"-->34
    [41] "43"            "61"            "25"            "34"           
    [45] "34"            "5"             "18"            "43"           
    [49] "62"            "34"            "19"            "63"           
    [53] "7"             
    --------------
    "seq37,seq36"-->36         "seq25,seq10,seq9"-->9       "56"           
    [57] "11"            "49"            "10"            "63"           
    [61] "seq37,seq36"-->36         "14"            "55"  
    #-------------
    "55"           
    [65] "14"            "37,36"         "43"            "45"           
    [69] "34"            "34"            "43"            "43"           
    [73] "25"            "14"            "65,64,61,56"   "15,9"         
    [77] "43"            "38"            "43"            "53"           
    [81] "11"            "9"             "34"            "15"           
    [85] "9"             "23,22"         "62"            "43"           
    [89] "37,36"
    
    # Common step after option 1 or option 2.
    grep -v ";NA;" g216_seqtab.csv.classified.csv > g216_seqtab.csv.classified_noNA.csv
    grep -v ";NA;" yycH_seqtab.csv.classified.csv > yycH_seqtab.csv.classified_noNA.csv
    
    #filtering the small ASV in which the number < 500: INPUT: *_noNA.csv; OUTPUT: *_seqtab.csv.classified_filtered.csv
    python3 g216_filtering.py  
        import pandas as pd
        # Load the file
        data = pd.read_csv('g216_seqtab.csv.classified_noNA.csv', delimiter=';')
        # Compute row sums excluding the first 2 columns
        data['row_sum'] = data.iloc[:, 2:].sum(axis=1)
        print(data.iloc[:, 2:])
        print(data.iloc[:, 2:].sum(axis=1))
        # Filter out rows where the sum is less than 100
        filtered_data = data[data['row_sum'] >= 500].drop(columns=['row_sum'])
        # Save the resulting dataframe to a new file
        filtered_data.to_csv('g216_seqtab.csv.classified_filtered.csv', sep=';', index=False)
    python3 yycH_filtering.py
    
  3. draw plot from three amplicons: cutadapted_g216, cutadapted_yycH

    #Taxonomic database setup and classification
    #- Custom databases of all unique g216 and yycH target sequences can be found at https://github.com/ssi-dk/epidome/tree/master/DB. 
    #- We formatted our g216 and yycH gene databases to be compatible with DADA2’s assign-Taxonomy function and used it to classify the S. epidermidis ASVs with the RDP naive Bayesian classifier method (https://github.com/ssi-dk/epidome/tree/master/scripts).
    #- ST classification of samples was performed using the g216 target sequence as the primary identifier. 
    #- All g216 sequences unique to a single clonal cluster in the database were immediately classified as the matching clone, and in cases were the g216 sequence matched multiple clones, the secondary yycH target sequences were parsed to determine which clone was present. When this classification failed to resolve due to multiple potential combinations of sequences, ASVs were categorized as “Unclassified”. Similarly, g216 sequences not found in the database were labelled as “Novel”.
    
    #install.packages("pls")
    #library(pls)
    #install.packages("reshape")
    #library(reshape)
    #install.packages("vegan")
    library('vegan') 
    library(scales)
    library(RColorBrewer)
    
    #~/Tools/csv2xls-0.4/csv_to_xls.py g216_seqtab.csv.classified_noNA.csv yycH_seqtab.csv.classified_noNA.csv -d$'\t' -o counts.xls
    
    setwd("/media/jhuang/Elements/Data_Luise_Epidome_batch3/run_2023_98_2")
    #under r4-base
    source("../epidome/scripts/epidome_functions.R")
    
    ST_amplicon_table = read.table("../epidome/DB/epidome_ST_amplicon_frequencies.txt",sep = "\t")
    
    epi01_table = read.table("g216_seqtab.csv.classified_filtered.csv",sep = ";",header=TRUE,row.names=1)
    epi02_table = read.table("yycH_seqtab.csv.classified_filtered.csv",sep = ";",header=TRUE,row.names=1)
    #> sum(epi01_table$AH.LH)
    #[1] 78872
    #> sum(epi02_table$AH.LH)
    #[1] 86949
    
    metadata_table = read.table("../metadata.txt",header=TRUE,row.names=1)
    metadata_table$patient.ID <- factor(metadata_table$patient.ID, levels=c("P7","P8","P9","P10","P11","P12","P13","P14","P15","P16","P17","P18","P19","P20", "AH","AL","CB","HR","KK",  "MC","MR","PT2","RP","SA","XN"))
    
    #epidome_object = setup_epidome_object(epi01_table,epi02_table, metadata_table=metadata_table)
    primer1_table <- epi01_table
    primer2_table <- epi02_table
    #setup_epidome_object <- function(primer1_table,primer2_table,metadata_table) {
      #DEBUG: 3:ncol --> 2:ncol
      primer1_counts = primer1_table[,2:ncol(primer1_table)]
      primer2_counts = primer2_table[,2:ncol(primer2_table)]
      primer1_all_sample_names = colnames(primer1_counts)
      primer2_all_sample_names = colnames(primer2_counts)
      samples_with_both_primers = primer1_all_sample_names[which(primer1_all_sample_names %in% primer2_all_sample_names)]
      samples_missing_primer1_data = primer2_all_sample_names[which(!primer2_all_sample_names %in% primer1_all_sample_names)]
      samples_missing_primer2_data = primer1_all_sample_names[which(!primer1_all_sample_names %in% primer2_all_sample_names)]
      primer1_seqs = as.vector(primer1_table$Seq_number)
      primer2_seqs = as.vector(primer2_table$Seq_number)
      primer1_seqs[which(is.na(primer1_seqs))] = "seqUnclassified"
      primer2_seqs[which(is.na(primer2_seqs))] = "seqUnclassified"
      primer1_counts = primer1_table[,which(colnames(primer1_table) %in% samples_with_both_primers)]
      primer2_counts = primer2_table[,which(colnames(primer2_table) %in% samples_with_both_primers)]
      if (!missing(metadata_table)) {
        metadata_names = rownames(metadata_table)
        samples_missing_metadata = samples_with_both_primers[which(!samples_with_both_primers %in% metadata_names)]
        samples_missing_primer_data = metadata_names[which(!metadata_names %in% samples_with_both_primers)]
        include_names = metadata_names[which(metadata_names %in% samples_with_both_primers)]
        metadata_include = match(include_names,metadata_names)
        metadata_include = metadata_include[which(!is.na(metadata_include))]
        metadata_table = metadata_table[metadata_include,]
        #primer1_include = match(colnames(primer1_counts),include_names)
        primer1_include = match(include_names,colnames(primer1_counts))
        primer1_include = primer1_include[which(!is.na(primer1_include))]
        #primer2_include = match(colnames(primer2_counts),include_names)
        primer2_include = match(include_names,colnames(primer2_counts))
        primer2_include = primer2_include[which(!is.na(primer2_include))]
        epi1_table = primer1_counts[,primer1_include]
        epi2_table = primer2_counts[,primer2_include]
        epidome_object = list('p1_seqs'=primer1_seqs,'p1_table'=epi1_table,'p2_seqs'=primer2_seqs,'p2_table'=epi2_table,'metadata'=metadata_table,'sample_names'=include_names,'meta_variables'=colnames(metadata_table))
        print(paste0("Metadata loaded with ",length(metadata_names)," samples and ",ncol(metadata_table)," variables"))
        print(paste0(length(samples_missing_metadata)," samples are found in both primer sets but not in metadata: ",paste0(samples_missing_metadata,collapse = " ")))
        print(paste0(length(samples_missing_primer_data)," samples are found in metadata but is missing from one or both primer sets: ",paste0(samples_missing_primer_data,collapse = " ")))
        print(paste0(length(include_names)," samples are found in metadata and both tables and are included in epidome object"))
    
      } else {
        epi1_table = primer1_counts[,match(colnames(primer1_counts),samples_with_both_primers)]
        epi2_table = primer2_counts[,match(colnames(primer2_counts),samples_with_both_primers)]
        epidome_object = list('p1_seqs'=primer1_seqs,'p1_table'=epi1_table,'p2_seqs'=primer2_seqs,'p2_table'=epi2_table,'sample_names'=samples_with_both_primers)
        print(paste0("No metadata loaded"))
        print(paste0(length(samples_missing_primer2_data)," samples are found in p1 table but not in p2 table: ",paste0(samples_missing_primer2_data,collapse = " ")))
        print(paste0(length(samples_missing_primer1_data)," samples are found in p2 table but not in p1 table: ",paste0(samples_missing_primer1_data,collapse = " ")))
        print(paste0(length(samples_with_both_primers)," samples are found in both tables and are included in epidome object"))
      }
      #return(epidome_object)
    #}
    
    str(epidome_object)
    #List of 7
    # $ p1_seqs       : chr [1:57] "seq28" "seq5" "seq40" "seq31,30" ...
    # $ p1_table      :'data.frame':    57 obs. of  51 variables:
    # $ p2_seqs       : chr [1:53] "seq34" "seq43" "seq37,36" "seq42" ...
    # $ p2_table      :'data.frame':    53 obs. of  51 variables:
    # $ metadata      :'data.frame':    51 obs. of  4 variables:
    # $ sample_names  : chr [1:51] "P7.Nose" "P8.Nose" "P9.Nose" "P10.Nose" ...
    # $ meta_variables: chr [1:4] "patient.ID" "sample.site" "sample.type" "patient.sample.site"
    unique(sort(epidome_object$p1_seqs))  # 57
    #[1] "seq28"       "seq5"        "seq40"       "seq31,30"    "seq14"      
    #[6] "seq20"       "seq22"       "seq26"       "seq37"       "seq21"      
    #[11] "seq29"       "seq24"       "seq1"        "seq26"       "seq40"      
    #[16] "seq21"       "seq18"       "seq40"       "seq16"       "seq37"      
    #[21] "seq5,2"      "seq18"       "seq11"       "seq14"       "seq26"      
    #[26] "seq31,30"    "seq37"       "seq26"       "seq1"        "seq21"      
    #[31] "seq3"        "seq40"       "seq26"       "seq28"       "seq40"      
    #[36] "seq10"       "seq5"        "seq40,31,30" "seq37"       "seq37"      
    #[41] "seq37"       "seq40"       "seq28"       "seq26"       "seq37"      
    #[46] "seq22"       "seq28"       "seq19"       "seq5,2"      "seq26"      
    #[51] "seq40"       "seq40"       "seq26"       "seq40"       "seq28"      
    #[56] "seq28"       "seq28"
    unique(sort(epidome_object$p2_seqs))
    
    #Image1
    primer_compare = compare_primer_output(epidome_object,color_variable = "sample.type")
    png("image1.png")
    primer_compare$plot
    dev.off()
    
    eo_ASV_combined = combine_ASVs_epidome(epidome_object)
    #> str(eo_ASV_combined)
    #List of 7
    #$ p1_seqs       : chr [1:21] "seq28" "seq5" "seq40" "seq31,30" ...
    #$ p1_table      :'data.frame': 21 obs. of  51 variables:
    #> eo_ASV_combined$p1_seqs  #21
    #[1] "seq28"       "seq5"        "seq40"       "seq31,30"    "seq14"      
    #[6] "seq20"       "seq22"       "seq26"       "seq37"       "seq21"      
    #[11] "seq29"       "seq24"       "seq1"        "seq18"       "seq16"      
    #[16] "seq5,2"      "seq11"       "seq3"        "seq10"       "seq40,31,30"
    #[21] "seq19"
    #eo_filtered = filter_lowcount_samples_epidome(eo_ASV_combined,500,500)
    
    #Note that we run the following code instead run the method from the script due to modified code: 
    count_table_checked = classify_epidome(eo_ASV_combined,ST_amplicon_table)
    write.csv(count_table_checked, file="count_table_checked.csv")
    strict_classifier=FALSE
    #classify_epidome = function(epidome_object,ST_amplicon_table,strict_classifier=FALSE) {
    #Adapt the code to the true epi01 and epi02 values, please give the complete code:
    epidome_object_norm = normalize_epidome_object(eo_ASV_combined)
    p1_seqs = unlist(lapply(as.vector(eo_ASV_combined$p1_seqs),function(x) substr(x,4,nchar(x))))
    #> p1_seqs
    #[1] "28"       "5"        "40"       "31,30"    "14"       "20"      
    #[7] "22"       "26"       "37"       "21"       "29"       "24"      
    #[13] "1"        "26"       "40"       "21"       "18"       "40"      
    #[19] "16"       "37"       "5,2"      "18"       "11"       "14"      
    #[25] "26"       "31,30"    "37"       "26"       "1"        "21"      
    #[31] "3"        "40"       "26"       "28"       "40"       "10"      
    #[37] "5"        "40,31,30" "37"       "37"       "37"       "40"      
    #[43] "28"       "26"       "37"       "22"       "28"       "19"      
    #[49] "5,2"      "26"       "40"       "40"       "26"       "40"      
    #[55] "28"       "28"       "28" 
    p2_seqs = unlist(lapply(as.vector(eo_ASV_combined$p2_seqs),function(x) substr(x,4,nchar(x))))
    n_samples = length(eo_ASV_combined$sample_names)
    n_p1_seqs = length(p1_seqs)
    count_table = matrix(nrow = 0, ncol = n_samples,dimnames = list('ST'=c(),'Samples'=eo_ASV_combined$sample_names))
    match_type_table = matrix(nrow = n_p1_seqs, ncol = n_samples)
    count_table_names = c()
    unclassified_count_vec = rep(0,n_samples)
    g1_unclassified_count_vec = rep(0,n_samples)
    for (i in 1:n_p1_seqs) {
      #i=4: "31,30" --> length(p1_seq_split)>1 --> possible_STs==("-","-","8","215") --> Unclassified!
      p1_seq = p1_seqs[i]
      p1_seq_split = strsplit(p1_seq,',')[[1]]
      if (length(p1_seq_split)>1) {
        possible_STs = as.vector(ST_amplicon_table$ST)[which(ST_amplicon_table$epi01_ASV %in% p1_seq_split)]
        if (length(possible_STs)==1) {
          p1_seq = p1_seq_split[1]
        } else {
          p1_seq = "Unclassified"
        }
      }
      if (p1_seq != "Unclassified") {
        p1_seq_ST_table = ST_amplicon_table[which(ST_amplicon_table$epi01_ASV==p1_seq),]
        unique_p1_ASVs = unique(as.vector(p1_seq_ST_table$ST))
        p2_ASVs_matching_p1 = p1_seq_ST_table$epi02_ASV
        count_vec = rep(0,n_samples)
        for (j in 1:n_samples) {
          p1_percent = epidome_object_norm$p1_table[i,j]
          p1_count = eo_ASV_combined$p1_table[i,j]
          if (p1_percent > 0.01) {
            p2_seqs_present_within_difference_threshold_idx = which(epidome_object_norm$p2_table[,j]>(p1_percent-10) & epidome_object_norm$p2_table[,j]>1)
            p2_seqs_present_ASVs = p2_seqs[p2_seqs_present_within_difference_threshold_idx]
            p2_seqs_present_ASVs_matching_p1 = p2_seqs_present_ASVs[which(p2_seqs_present_ASVs %in% p2_ASVs_matching_p1)]
            p2_percent = epidome_object_norm$p2_table[which(p2_seqs %in% p2_seqs_present_ASVs_matching_p1),j]
            p2_count = eo_ASV_combined$p2_table[which(p2_seqs %in% p2_seqs_present_ASVs_matching_p1),j]
            if (length(p2_seqs_present_ASVs_matching_p1) == 0) {
              ST_order = order(p1_seq_ST_table$freq,decreasing = T)
              classification_group = as.vector(p1_seq_ST_table$ST)[ST_order[1]]
              if (classification_group %in% count_table_names) {
                classification_idx = which(count_table_names == classification_group)
                count_table[classification_idx,j] = count_table[classification_idx,j]+p1_count
              } else {
                count_vec = rep(0,n_samples)
                count_vec[j] = p1_count
                count_table = rbind(count_table,count_vec)
                count_table_names = c(count_table_names,classification_group)
              }
              #match_type_table[i,j] = "epi01 match without epi02 match"
              # Replace the match type with actual epi01 and epi02 values
              match_type_table[i, j] = paste0("epi01:", p1_seq, " epi02:NA", " ", classification_group)
            }
            else if (length(p2_seqs_present_ASVs_matching_p1) == 1) {
              p1_p2_seq_ST_table = p1_seq_ST_table[which(p1_seq_ST_table$epi02_ASV==p2_seqs_present_ASVs_matching_p1[1]),]
              ST_order = order(p1_p2_seq_ST_table$freq,decreasing = T)
              classification_group = as.vector(p1_p2_seq_ST_table$ST)[ST_order[1]]
              if (classification_group %in% count_table_names) {
                classification_idx = which(count_table_names == classification_group)
                count_table[classification_idx,j] = count_table[classification_idx,j]+p1_count
              } else {
                count_vec = rep(0,n_samples)
                count_vec[j] = p1_count
                count_table = rbind(count_table,count_vec)
                count_table_names = c(count_table_names,classification_group)
              }
              #match_type_table[i,j] = "Unique epi01 epi02 combination"
              # Replace the match type with actual epi01 and epi02 values
              match_type_table[i, j] = paste0("epi01:", p1_seq, " epi02:", p2_seqs_present_ASVs_matching_p1[1], " ", classification_group)
            } else {
              if (strict_classifier) {
                unclassified_count_vec[j] = unclassified_count_vec[j]+p1_count
              } else {
                p1_p2_seq_ST_table = p1_seq_ST_table[which(p1_seq_ST_table$epi02_ASV %in% p2_seqs_present_ASVs_matching_p1),]
                ST_order = order(p1_p2_seq_ST_table$freq,decreasing = T)
                classification_group = as.vector(p1_p2_seq_ST_table$ST)[ST_order[1]]
                if (classification_group %in% count_table_names) {
                  classification_idx = which(count_table_names == classification_group)
                  count_table[classification_idx,j] = count_table[classification_idx,j]+p1_count
                } else {
                  count_vec = rep(0,n_samples)
                  count_vec[j] = p1_count
                  count_table = rbind(count_table,count_vec)
                  count_table_names = c(count_table_names,classification_group)
                }
              }
    
              ##unclassified_count_vec[j] = unclassified_count_vec[j]+p1_count
              #match_type_table[i,j] = c("Non unique epi01 epi02 combination")
              # Concatenate all found epi02 values and replace the match type
              epi02_values = paste(p2_seqs_present_ASVs_matching_p1, collapse = ",")
              match_type_table[i, j] = paste0("epi01:", p1_seq, " epi02:", epi02_values, " ", classification_group)
            }
    
          } else {
            match_type_table[i,j] = "Low counts"
            unclassified_count_vec[j] = unclassified_count_vec[j]+p1_count
          }
    
        }
      } else {
        count_vec = as.numeric(as.vector(eo_ASV_combined$p1_table[i,]))
        unclassified_count_vec = unclassified_count_vec+count_vec
        g1_unclassified_count_vec = g1_unclassified_count_vec+count_vec
        match_type_vec = rep("Unclassified epi01 match",n_samples)
        match_type_table[i,] = match_type_vec
      }
    
    }
    count_table = rbind(count_table,unclassified_count_vec)
    rownames(count_table) = c(count_table_names,"Unclassified")
    count_table_df <- as.data.frame(count_table)
    write.csv(count_table_df, file="count_table_df.csv")
    #TODO: delete the last record "Unclassified" due to the few reads!
    #count_table_df = count_table_df[-32,]
    
    # --> FOR STEP4: prepare the file match_type_table.csv for treeHeatmap drawing.
    write.csv(match_type_table, file = "match_type_table.csv", row.names = TRUE)
    
    count_table_df_ordered = count_table_df[order(rowSums(count_table_df),decreasing = T),]
    row_sums <- apply(count_table_df_ordered, MARGIN=1, FUN=sum)
    #TODO: the ST in which the counts <= 500 not shown!
    #NOTE: ST225 is "epi01:3 epi02:3 225"!
    # Since "epi01:3 epi02:NA 225","epi01:3 epi02:NA 225" are 225 disappeared in the match_type_table.csv_____
    #Note that epi01 is g216, epi02 is yycH.
    #        215            -            8           59          130          297 
    #      611637       401179       341187       337409       316389       316273 
    #         331            2           73          384          200            5 
    #      178616       125734        99899        97885        93613        92385 
    #          14          218           23          278          487          290 
    #       71417        62467        60192        57213        53630        51454 
    #          87           89          100          329          153           83 
    #       49000        23390        21880        21635        19453        10539 
    #          19          170          225*         570          184           88 
    #        9465         1839         1487          863          437          318 
    #         114 Unclassified 
    #         108           11 
    #> row_sums
    #         215            -           59 Unclassified          297          130 
    #      611915       442192       359708       357868       335880       315734 
    #           5           73          331          384          200           14 
    #      145578       134301       127734       126465        94917        85440 
    #           2          218           23          278          290           87 
    #       85438        62492        60192        57213        51363        49000 
    #          89          100          329          210           83           19 
    #       23390        21880        21635        12663        10642         9465 
    #         170          225          570           88          114           10 
    #        1839         1487          863          318          108           91 
    # > unique(sort(eo_ASV_combined$p1_seqs))
    #  [1] "seq1"  "seq10" "seq11" "seq14" "seq16" "seq18" "seq19" "seq2"  "seq20"
    # [10] "seq21" "seq22" "seq24" "seq26" "seq28" "seq29" "seq3"  "seq31" "seq37"
    # [19] "seq40" "seq5" 
    # > unique(sort(eo_ASV_combined$p2_seqs))
    #  [1] "seq10" "seq11" "seq12" "seq14" "seq15" "seq18" "seq19" "seq2"  "seq21"
    # [10] "seq23" "seq25" "seq26" "seq3"  "seq34" "seq35" "seq36" "seq38" "seq42"
    # [19] "seq43" "seq49" "seq5"  "seq52" "seq53" "seq55" "seq56" "seq61" "seq62"
    # [28] "seq63" "seq65" "seq7"
    #row.names(count_table_df) <- c("ST215","ST130","ST278","ST200","ST184","ST5","ST59","ST83","ST487","ST114","ST8","ST297","ST153","ST384","ST2","ST14","ST89","ST570","-","ST290","ST331","ST73","ST88","ST100","ST87","ST23","ST218","ST329","ST19","ST225","ST170")
    #row.names(count_table_df) <- c("215","130","278","200","184","5","59","83","487","114","8","297","153","384","2","14","89","570","-","290","331","73","88","100","87","23","218","329","19","225","170")
    #row.names(count_table_df) <- c("ST215","ST130","ST278","ST200","ST5","ST59","ST83","ST114","ST297","ST384","ST14","ST89","ST210","-","ST328","ST331","ST73","ST2","ST88","ST100","ST10","ST290","ST87","ST23","ST218","ST329","ST19","ST225","ST170","Unclassified")
    
    col_order <- c("P7.Nose","P8.Nose","P9.Nose","P10.Nose","P11.Nose","P12.Nose","P13.Nose","P14.Nose","P15.Nose",     "P16.Foot","P17.Foot","P18.Foot","P19.Foot","P20.Foot","P16.Nose","P17.Nose","P18.Nose","P19.Nose","P20.Nose",    "AH.LH","AH.NLH","AH.Nose", "AL.LH","AL.NLH","AL.Nose", "CB.LH","CB.NLH","CB.Nose", "HR.LH","HR.NLH","HR.Nose", "KK.LH","KK.NLH","KK.Nose", "MC.LH","MC.NLH","MC.Nose", "MR.LH","MR.NLH","MR.Nose",  "PT2.LH","PT2.NLH","PT2.Nose", "RP.LH","RP.NLH","RP.Nose", "SA.LH","SA.NLH","SA.Nose", "XN.LH","XN.NLH","XN.Nose") 
    count_table_df_reordered <- count_table_df_ordered[,col_order]
    p = make_barplot_epidome(count_table_df_reordered,reorder=FALSE,normalize=TRUE)
    #p = make_barplot_epidome(count_table_df_reordered,reorder=TRUE,normalize=TRUE)
    png("Barplot_All.png", width=1600, height=900)
    p
    dev.off()
    
    #Change rowname from '-' to 'Novel'
    rownames(count_table_df_reordered)[rownames(count_table_df_reordered) == "-"] <- "Novel"
    write.csv(file="count_table_df_reordered.txt", count_table_df_reordered)
    
  4. prepare data for plotTreeHeatmap: table+tree.

      #replace "," to \n in match_type_table.csv
      sed 's/\",\"/\n/g' match_type_table.csv > match_type_table.csv_
      grep "epi02" match_type_table.csv_ > match_type_table.csv__
      grep -v "NA" match_type_table.csv__ > match_type_table.csv___
      #(Deprecated) awk -F' ' '{ split($2, a, ":"); split(a[2], b, ","); print $1 " epi02:" b[1] " " $3 }' match_type_table.csv___ > match_type_table.csv____
      #(Deprecated) sed -i -e 's/\"//g' match_type_table.csv____
      #(Deprecated) sort match_type_table.csv____ -u > match_type_table.csv_____
    
      #Manually select and merge items from match_type_table.csv___ and save it to match_type_table.csv____ (see content as below)!
          epi01:1 epi02:36 2
          epi01:3 epi02:3 225
          epi01:5 epi02:34 130
          epi01:5 epi02:3 278
          epi01:5 epi02:43 200
          epi01:5 epi02:36 184
          epi01:11 epi02:19 19
          epi01:14 epi02:34 297
          epi01:14 epi02:36 153
          epi01:16 epi02:56 329
          epi01:16 epi02:55 329
          epi01:18 epi02:14 218
          epi01:19 epi02:43 170
          epi01:2 epi02:15 -
          epi01:20 epi02:36 2
          epi01:20 epi02:43 14
          epi01:20 epi02:42 89
          epi01:20 epi02:34 384
          epi01:20 epi02:11 570
          epi01:21 epi02:34 100
          epi01:21 epi02:63 290
          epi01:24 epi02:38 23
          epi01:24 epi02:2 87
          epi01:26 epi02:34 -
          epi01:26 epi02:53 290
          epi01:26 epi02:63 331
          epi01:28 epi02:43 215
          epi01:29 epi02:43 215
          epi01:29 epi02:55 -
          epi01:31 epi02:43 8
          epi01:37 epi02:36 2
          epi01:37 epi02:26 88
          epi01:37 epi02:43 73
          epi01:40 epi02:2 5
          epi01:40 epi02:34 5
          epi01:40 epi02:34 59
          epi01:40 epi02:26 114
          epi01:40 epi02:43 83
          epi01:40 epi02:36 487
          #---- v2 ('-' three times, ST215, ST290, ST329, ST59 two times) --> delete 'epi01:28 epi02:43 215' ----
          epi01:1 epi02:34 -
          epi01:3 epi02:3 225
          epi01:5 epi02:3 278
          epi01:5 epi02:34 130
          epi01:5 epi02:43 200
          epi01:11 epi02:19 19
          epi01:14 epi02:34 297
          epi01:16 epi02:55 329
          epi01:16 epi02:56 329
          epi01:18 epi02:14 218
          epi01:19 epi02:43 170
          epi01:20 epi02:42 89
          epi01:20 epi02:43 14
          epi01:20 epi02:34 384
          epi01:20 epi02:11 570
          epi01:20 epi02:3 210
          epi01:21 epi02:38 10
          epi01:21 epi02:34 100
          epi01:21 epi02:63 290
          epi01:24 epi02:38 23
          epi01:24 epi02:2 87
          epi01:26 epi02:34 -
          epi01:26 epi02:53 290
          epi01:26 epi02:63 331
          epi01:29 epi02:43 215
          epi01:29 epi02:55 -
          epi01:37 epi02:62 2
          epi01:37 epi02:26 88
          epi01:37 epi02:43 73
          epi01:37 epi02:34 59
          epi01:40 epi02:2 5
          epi01:40 epi02:26 114
          epi01:40 epi02:43 83
          epi01:40 epi02:34 59
    
    python3 concat_epi01_epi02.py
        # Define the function to extract fasta sequences
        def fasta_to_dict(filename):
            sequences = {}
            header = None
            sequence = []
            with open(filename, 'r') as file:
                for line in file:
                    line = line.strip()
                    if line.startswith('>'):
                        if header:
                            sequences[header] = ''.join(sequence)
                        header = line[1:]
                        sequence = []
                    else:
                        sequence.append(line)
                if header:
                    sequences[header] = ''.join(sequence)
            return sequences
        # Read combinations from match_type_table.csv____
        combinations = []
        with open("match_type_table.csv____", 'r') as csv_file:
            for line in csv_file:
                parts = line.strip().split()
                epi01 = parts[0].split(":")[1]
                epi02 = parts[1].split(":")[1]
                st = parts[2]
                combinations.append((epi01, epi02, st))
    
        # Read both fasta files into dictionaries
        g216_sequences = fasta_to_dict("../epidome/DB/epi01_ref_aln.fasta")
        yycH_sequences = fasta_to_dict("../epidome/DB/epi02_ref_aln.fasta")
    
        output_filename = "concatenated_sequences.fasta"
        with open(output_filename, 'w') as output_file:
            for epi01, epi02, st in combinations:
                if epi01 in g216_sequences and epi02 in yycH_sequences:
                    concatenated_seq = g216_sequences[epi01] + "NNN" + yycH_sequences[epi02]
                    header = f">g216-{epi01}_yycH-{epi02}_ST{st}"
                    output_file.write(f"{header}\n{concatenated_seq}\n")
    
        print(f"Concatenated sequences saved to {output_filename}")
    
    sed -i -e 's/ST-/-/g' concatenated_sequences.fasta
    muscle -in concatenated_sequences.fasta -out aligned.fasta
    muscle -clw -in concatenated_sequences.fasta -out aligned.clw
    FastTree -nt < aligned.fasta > concatenated_sequences.tree
    # Run plotTree.R to draw ggtree_and_gheatmap.png.
    
    library(ggtree)
    library(ggplot2)
    
    setwd("/home/jhuang/DATA/Data_Luise_Epidome_batch3/run_2023_98_2/")
    
    # -- edit tree --
    #https://icytree.org/
    info <- read.csv("typing_34.csv")
    info$name <- info$Isolate
    tree <- read.tree("concatenated_sequences.tree")
    #cols <- c(infection='purple2', commensalism='skyblue2')
    
    library(dplyr)
    heatmapData2 <- info %>% select(Isolate,  g216, ST)
    rn <- heatmapData2$Isolate
    heatmapData2$Isolate <- NULL
    heatmapData2 <- as.data.frame(sapply(heatmapData2, as.character))
    rownames(heatmapData2) <- rn
    
    #https://bookdown.org/hneth/ds4psy/D-3-apx-colors-basics.html
    #"g216:16","g216:18","g216:19","g216:20"(4),"g216:21"(2),"g216:24"(1),"g216:26"(2)
    heatmap.colours <- c("darkgrey",   "palegreen","seagreen","seagreen","seagreen4","seagreen4","yellowgreen",    "orange4","orange4","orange4","orange4","orange4",    "dodgerblue3","blue4","dodgerblue3","dodgerblue3",  "azure4",    "magenta2","maroon2","mediumorchid1","mediumorchid1","mediumorchid1","mediumorchid1","mediumorchid1", "mediumorchid3","mediumorchid3","mediumorchid3","mediumpurple1","mediumpurple4","mediumpurple4",         "red4",                                            "olivedrab3","olivedrab4","seagreen","seagreen4","yellowgreen",    "orange4",    "blue4","dodgerblue3",           "magenta2","maroon2","mediumorchid1", "mediumorchid3","mediumpurple1","mediumpurple4",         "red4")
    names(heatmap.colours) <- c("-",    "ST225","ST130","ST278","ST19","ST200", "ST297",                                             "ST5","ST59","ST114","ST83","ST487",   "ST2","ST215","ST73","ST88",   "ST8",                                                               "ST329","ST170", "ST89","ST14","ST570","ST384","ST210",                  "ST10","ST100","ST290","ST23","ST87", "ST331",     "ST218",      "g216:1","g216:3","g216:5","g216:11","g216:14",   "g216:40",   "g216:29","g216:37",        "g216:16","g216:19","g216:20","g216:21","g216:24","g216:26",    "g216:18")
    #mydat$Regulation <- factor(mydat$Regulation, levels=c("up","down"))
    
    #circular
    #geom_tippoint(aes(color=Type)) + scale_color_manual(values=cols) + 
    p <- ggtree(tree, layout='circular', branch.length='none') %<+% info + geom_tiplab2(aes(label=name), offset=1)
    #, geom='text', align=TRUE,  linetype=NA, hjust=1.8,check.overlap=TRUE, size=3.3
    #difference between geom_tiplab and geom_tiplab2?
    #+ theme(axis.text.x = element_text(angle = 30, vjust = 0.5)) + theme(axis.text = element_text(size = 20))  + scale_size(range = c(1, 20))
    #font.size=10, 
    png("ggtree.png", width=1260, height=1260)
    #svg("ggtree.svg", width=1260, height=1260)
    p
    dev.off()
    
    desired_order <- c("-",    "ST225","ST130","ST278","ST19","ST200", "ST297",                                             "ST5","ST59","ST114","ST83","ST487",   "ST2","ST215","ST73","ST88",   "ST8",                                                               "ST329","ST170", "ST89","ST14","ST570","ST384","ST210",                  "ST10","ST100","ST290","ST23","ST87", "ST331",     "ST218",      "g216:1","g216:3","g216:5","g216:11","g216:14",   "g216:40",   "g216:29","g216:37",        "g216:16","g216:19","g216:20","g216:21","g216:24","g216:26",    "g216:18")
    
    png("ggtree_and_gheatmap.png", width=1290, height=1000)
    #svg("ggtree_and_gheatmap.svg", width=17, height=15)
    gheatmap(p, heatmapData2, width=0.15,colnames_position="top", colnames_angle=90, colnames_offset_y = 0.1, hjust=0.5, font.size=4, offset = 9) + scale_fill_manual(values=heatmap.colours, breaks=desired_order) +  theme(legend.text = element_text(size = 14)) + theme(legend.title = element_text(size = 14)) + guides(fill=guide_legend(title=""), color = guide_legend(override.aes = list(size = 5)))  
    dev.off()
    
  5. prepre the table for the alpha diversity calculation

    To calculate the alpha diversity values for each row based on the provided metrics (chao1, observed_otus, shannon, PD_whole_tree), you'll typically use specialized software packages such as QIIME or R packages like vegan. Here, I can provide a high-level approach and calculations for some of the metrics. However, some of the metrics like PD_whole_tree require phylogenetic trees, which can't be derived from the table you provided.

    import pandas as pd
    import numpy as np
    
    # Read the data from CSV
    df = pd.read_csv('count_table_df_reordered.csv', index_col=0)
    
    def chao1(s):
    s = np.array(s)
    n = np.sum(s)
    s_obs = np.count_nonzero(s)
    s1 = np.sum(s == 1)
    s2 = np.sum(s == 2)
    return s_obs + ((s1 ** 2) / (2 * s2)) if s2 != 0 else s_obs
    
    def observed_sts(s):
    return np.count_nonzero(s)
    
    def shannon2(s):
    s = np.array(s)
    p = s / np.sum(s)
    return -np.sum(p * np.log(p + 1e-10))  # added small value to avoid log(0)
    
    def shannon(s):
    s = np.array(s)
    total = np.sum(s)
    p = s / total  # Convert absolute counts to proportions
    return -np.sum(p * np.log(p + 1e-10))  # added small value to avoid log(0)
    
    # Calculate the metrics for each sample (column)
    results = {
    'sample': [],
    'chao1': [],
    'observed_sts': [],
    'shannon': []
    }
    
    for column in df.columns:
    results['sample'].append(column)
    results['chao1'].append(chao1(df[column]))
    results['observed_sts'].append(observed_sts(df[column]))
    results['shannon'].append(shannon(df[column]))
    
    # Convert results to DataFrame and save to a new CSV
    result_df = pd.DataFrame(results)
    result_df.to_csv('alpha_diversity_metrics_samples.csv', index=False)
    

    For PD_whole_tree, you would require a phylogenetic tree of the OTUs and then compute the sum of branch lengths. This is more complex and requires specialized software and the phylogenetic tree information, which isn't present in the table you provided.

    The input data being absolute counts versus relative abundance does influence certain alpha diversity metrics. Let's re-evaluate:

    • Observed OTUs: Whether the data is in absolute counts or relative abundance doesn't impact the "Observed OTUs" metric. It's simply counting how many OTUs have a non-zero count.

    • Chao1: Chao1 is intended for absolute counts. The metric is based on the number of singletons and doubletons, which are derived from raw counts and not relative abundances. Hence, the calculation remains accurate.

    • Shannon Diversity Index: The Shannon Diversity Index is based on the relative abundances of OTUs, not their absolute counts. The formula requires the proportion of each species (or OTU) in the sample. Thus, you'd need to convert absolute counts to proportions (or relative abundance) before calculating this metric.

  6. calculate the alpha diversity in Phyloseq.Rmd. The code for the calculation is as follows.

    \pagebreak
    
    # Alpha diversity for ST
    Plot Shannon index and observed STs. 
    Regroup together samples from the same group.
    
    ```{r, echo=TRUE, warning=FALSE}
    hmp.div_st <- read.csv("alpha_diversity_metrics_samples.csv", sep=",") 
    colnames(hmp.div_st) <- c("sample", "chao1", "observed_sts", "shannon")
    row.names(hmp.div_st) <- hmp.div_st$sample
    div.df <- merge(hmp.div_st, hmp.meta, by.x="sample", by.y = "sam_name")
    div.df2 <- div.df[, c("Group", "shannon", "observed_sts")]
    colnames(div.df2) <- c("Group", "Shannon", "ST")
    #colnames(div.df2)
    options(max.print=999999)
    
    stat.test.Shannon <- compare_means(
    Shannon ~ Group, data = div.df2,
    method = "t.test"
    )
    knitr::kable(stat.test.Shannon) %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
    
    stat.test.ST <- compare_means(
    ST ~ Group, data = div.df2,
    method = "t.test"
    )
    knitr::kable(stat.test.ST) %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
    
    div_df_melt <- reshape2::melt(div.df2)
    
    p <- ggboxplot(div_df_melt, x = "Group", y = "value",
                  facet.by = "variable", 
                  scales = "free",
                  width = 0.5,
                  fill = "gray", legend= "right")
    #ggpar(p, xlab = FALSE, ylab = FALSE)
    lev <- levels(factor(div_df_melt$Group)) # get the variables
    L.pairs <- combn(seq_along(lev), 2, simplify = FALSE, FUN = function(i) lev[i]) #%>% filter(p.signif != "ns")
    my_stat_compare_means  <- function (mapping = NULL, data = NULL, method = NULL, paired = FALSE, 
        method.args = list(), ref.group = NULL, comparisons = NULL, 
        hide.ns = FALSE, label.sep = ", ", label = NULL, label.x.npc = "left", 
        label.y.npc = "top", label.x = NULL, label.y = NULL, tip.length = 0.03, 
        symnum.args = list(), geom = "text", position = "identity", 
        na.rm = FALSE, show.legend = NA, inherit.aes = TRUE, ...) 
    {
        if (!is.null(comparisons)) {
            method.info <- ggpubr:::.method_info(method)
            method <- method.info$method
            method.args <- ggpubr:::.add_item(method.args, paired = paired)
            if (method == "wilcox.test") 
                method.args$exact <- FALSE
            pms <- list(...)
            size <- ifelse(is.null(pms$size), 0.3, pms$size)
            color <- ifelse(is.null(pms$color), "black", pms$color)
            map_signif_level <- FALSE
            if (is.null(label)) 
                label <- "p.format"
            if (ggpubr:::.is_p.signif_in_mapping(mapping) | (label %in% "p.signif")) {
                if (ggpubr:::.is_empty(symnum.args)) {
                    map_signif_level <- c(`****` = 1e-04, `***` = 0.001, 
                      `**` = 0.01, `*` = 0.05, ns = 1)
                } else {
                  map_signif_level <- symnum.args
                } 
                if (hide.ns) 
                    names(map_signif_level)[5] <- " "
            }
            step_increase <- ifelse(is.null(label.y), 0.12, 0)
            ggsignif::geom_signif(comparisons = comparisons, y_position = label.y, 
                test = method, test.args = method.args, step_increase = step_increase, 
                size = size, color = color, map_signif_level = map_signif_level, 
                tip_length = tip.length, data = data)
        } else {
            mapping <- ggpubr:::.update_mapping(mapping, label)
            layer(stat = StatCompareMeans, data = data, mapping = mapping, 
                geom = geom, position = position, show.legend = show.legend, 
                inherit.aes = inherit.aes, params = list(label.x.npc = label.x.npc, 
                    label.y.npc = label.y.npc, label.x = label.x, 
                    label.y = label.y, label.sep = label.sep, method = method, 
                    method.args = method.args, paired = paired, ref.group = ref.group, 
                    symnum.args = symnum.args, hide.ns = hide.ns, 
                    na.rm = na.rm, ...))
        }
    }
    
    p3 <- p + 
      stat_compare_means(
        method="t.test",
        comparisons = list(c("P16-P20.Foot", "P16-P20.Nose"), c("AH-XN.LH", "AH-XN.Nose"), c("AH-XN.NLH", "AH-XN.Nose")), 
        label = "p.signif",
        symnum.args <- list(cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 1), symbols = c("****", "***", "**", "*", "ns")),
      ) +
      theme(axis.text.x = element_text(angle = 30, hjust = 1))  # add this line to rotate x-axis text
    print(p3)
    
    ggsave("./figures/alpha_diversity_ST_Group2.png", device="png", height = 10, width = 12)
    ggsave("./figures/alpha_diversity_ST_Group2.svg", device="svg", height = 10, width = 12)
    ```
    

like unlike

点赞本文的读者

还没有人对此文章表态


本文有评论

没有评论

看文章,发评论,不要沉默


© 2023 XGenes.com Impressum