RNA-seq data analysis (R-part) for S. epidermidis 1585

gene_x 0 like s 199 view s

Tags: R, processing, RNA-seq

  1. Data input and generate PCA plot

    # Import the required libraries
    library("AnnotationDbi")
    library("clusterProfiler")
    library("ReactomePA")
    library(gplots)
    library(tximport)
    library(DESeq2)
    setwd("/home/jhuang/DATA/Data_Marc_RNA-seq_Sepidermidis/results_1585/star_salmon")
    # Define paths to your Salmon output quantification files
    files <- c("15m_A" = "./EX_15_min_A/quant.sf",
              "15m_B" = "./EX_15_min_B/quant.sf",
              "15m_C" = "./EX_15_min_C/quant.sf",
              "1h_A" = "./EX_1h_A/quant.sf",
              "1h_B" = "./EX_1h_B/quant.sf",
              "1h_C" = "./EX_1h_C/quant.sf",
              "2h_A" = "./EX_2h_A/quant.sf",
              "2h_B" = "./EX_2h_B/quant.sf",
              "2h_C" = "./EX_2h_C/quant.sf",
              "4h_A" = "./EX_4h_A/quant.sf",
              "4h_B" = "./EX_4h_B/quant.sf",
              "4h_C" = "./EX_4h_C/quant.sf",
              "6h_A" = "./EX_6h_A/quant.sf",
              "6h_B" = "./EX_6h_B/quant.sf",
              "6h_C" = "./EX_6h_C/quant.sf",
              "4d_A" = "./EX_Day_4_A/quant.sf",
              "4d_B" = "./EX_Day_4_B/quant.sf",
              "4d_C" = "./EX_Day_4_C/quant.sf")
    # Import the transcript abundance data with tximport
    txi <- tximport(files, type = "salmon", txIn = TRUE, txOut = TRUE)
    # Define the replicates and condition of the samples
    replicate <- factor(c("A", "B", "C", "A", "B", "C", "A", "B", "C", "A", "B", "C", "A", "B", "C", "A", "B", "C"))
    condition <- factor(c("15m", "15m", "15m", "1h", "1h", "1h", "2h", "2h", "2h", "4h", "4h", "4h", "6h", "6h", "6h", "4d", "4d", "4d"))
    # Define the colData for DESeq2
    colData <- data.frame(condition=condition, replicate=replicate, row.names=names(files))
    # Create DESeqDataSet object
    dds <- DESeqDataSetFromTximport(txi, colData=colData, design=~condition)
    # In the context of your new code which is using tximport and DESeq2, you don't necessarily need this step. The reason is that DESeq2 performs its own filtering of low-count genes during the normalization and differential expression steps.
    # Filter data to retain only genes with more than 2 counts > 3 across all samples
    # dds <- dds[rowSums(counts(dds) > 3) > 2, ]
    # Output raw count data to a CSV file
    write.csv(counts(dds), file="transcript_counts.csv")
    # -- gene-level count data --
    # Read in the tx2gene map from salmon_tx2gene.tsv
    #tx2gene <- read.csv("salmon_tx2gene.tsv", sep="\t", header=FALSE)
    tx2gene <- read.table("salmon_tx2gene.tsv", header=FALSE, stringsAsFactors=FALSE)
    # Set the column names
    colnames(tx2gene) <- c("transcript_id", "gene_id", "gene_name")
    # Remove the gene_name column if not needed
    tx2gene <- tx2gene[,1:2]
    # Import and summarize the Salmon data with tximport
    txi <- tximport(files, type = "salmon", tx2gene = tx2gene, txOut = FALSE)
    # Continue with the DESeq2 workflow as before...
    colData <- data.frame(condition=condition, replicate=replicate, row.names=names(files))
    dds <- DESeqDataSetFromTximport(txi, colData=colData, design=~condition)
    #dds <- dds[rowSums(counts(dds) > 3) > 2, ]    #60605-->26543
    write.csv(counts(dds, normalized=FALSE), file="gene_counts.csv")
    #~/Tools/csv2xls-0.4/csv_to_xls.py gene_counts.csv -d',' -o gene_counts.xls
    
    #TODO: why a lot of reads were removed due to the too_short?
    #STAR --runThreadN 4 --genomeDir /path/to/GenomeDir --readFilesIn /path/to/read1.fastq /path/to/read2.fastq --outFilterMatchNmin 50 --outSAMtype BAM SortedByCoordinate --outFileNamePrefix /path/to/output
    dim(counts(dds))
    head(counts(dds), 10)
    #DEBUG: DESeq should not used here!?
    #TODO_NEXT_WEEK: rerun without fistly DESeq(dds) to compare if the results is the same to process_1 
    #dds <- DESeq(dds)
    rld <- rlogTransformation(dds)
    
    # draw simple pca and heatmap
    library(gplots) 
    library("RColorBrewer")
    #mat <- assay(rld)
    #mm <- model.matrix(~condition, colData(rld))
    #mat <- limma::removeBatchEffect(mat, batch=rld$batch, design=mm)
    #assay(rld) <- mat
    # -- pca --
    png("pca.png", 1200, 800)
    plotPCA(rld, intgroup=c("condition"))
    dev.off()
    # -- heatmap --
    png("heatmap.png", 1200, 800)
    distsRL <- dist(t(assay(rld)))
    mat <- as.matrix(distsRL)
    hc <- hclust(distsRL)
    hmcol <- colorRampPalette(brewer.pal(9,"GnBu"))(100)
    heatmap.2(mat, Rowv=as.dendrogram(hc),symm=TRUE, trace="none",col = rev(hmcol), margin=c(13, 13))
    dev.off()
    
  2. Analysis on differentially expressed genes

    setwd("/home/jhuang/DATA/Data_Marc_RNA-seq_Sepidermidis/results_1585/star_salmon/degenes")
    
    dds$condition <- relevel(dds$condition, "15m")
    dds = DESeq(dds, betaPrior=FALSE)
    resultsNames(dds)
    clist <- c("1h_vs_15m", "2h_vs_15m", "4h_vs_15m", "6h_vs_15m",  "4d_vs_15m")
    
    for (i in clist) {
      contrast = paste("condition", i, sep="_")
      res = results(dds, name=contrast)
      res <- res[!is.na(res$log2FoldChange),]
      res_df <- as.data.frame(res)
      write.csv(as.data.frame(res_df[order(res_df$pvalue),]), file = paste(i, "all.txt", sep="-"))
      up <- subset(res_df, padj<=0.05 & log2FoldChange>=2)
      down <- subset(res_df, padj<=0.05 & log2FoldChange<=-2)
      write.csv(as.data.frame(up[order(up$log2FoldChange,decreasing=TRUE),]), file = paste(i, "up.txt", sep="-"))
      write.csv(as.data.frame(down[order(abs(down$log2FoldChange),decreasing=TRUE),]), file = paste(i, "down.txt", sep="-"))
    }
    
    library("EnhancedVolcano")
    for i in 1h_vs_15m 2h_vs_15m 4h_vs_15m 6h_vs_15m 4d_vs_15m; do
      echo "res <- read.csv(file = paste(\"${i}\", \"all.txt\", sep=\"-\"), row.names = 1)"
      echo "res_df <- as.data.frame(res)"
      echo "png(\"${i}.png\",width=800, height=600)"
      #legendPosition = 'right',legendLabSize = 12,  arrowheads = FALSE,
      #echo "EnhancedVolcano(res, lab = rownames(res),x = 'log2FoldChange',y = 'padj', pCutoff=5e-2, FCcutoff=1.2, title='', subtitleLabSize = 18, pointSize = 3.0, labSize = 5.0, colAlpha=1, legendIconSize = 4.0, drawConnectors = TRUE, widthConnectors = 0.5, colConnectors = 'black', subtitle=expression(~Delta*\"$(echo $i | cut -d'_' -f1) versus \" *~Delta*\"$(echo $i | cut -d'_' -f3)\"))"
      echo "EnhancedVolcano(res, lab = rownames(res),x = 'log2FoldChange',y = 'padj', pCutoff=5e-2, FCcutoff=1.0, title='', subtitleLabSize = 18, pointSize = 3.0, labSize = 5.0, colAlpha=1, legendIconSize = 4.0, drawConnectors = TRUE, widthConnectors = 0.5, colConnectors = 'black', subtitle=expression(\"$(echo $i | cut -d'_' -f1) versus $(echo $i | cut -d'_' -f3)\"))"
      echo "dev.off()"
    done
    
    res <- read.csv(file = paste("1h_vs_15m", "all.txt", sep="-"), row.names = 1)
    res_df <- as.data.frame(res)
    png("1h_vs_15m.png",width=1000, height=800)
    EnhancedVolcano(res, lab = rownames(res),x = 'log2FoldChange',y = 'padj', ylim = c(0, 1), pCutoff=5e-2, FCcutoff=2.0, title='', subtitleLabSize = 18, pointSize = 3.0, labSize = 5.0, colAlpha=1, legendIconSize = 4.0, drawConnectors = TRUE, widthConnectors = 0.5, colConnectors = 'black', subtitle=expression("1h versus 15m"))
    dev.off()
    
    res <- read.csv(file = paste("2h_vs_15m", "all.txt", sep="-"), row.names = 1)
    res_df <- as.data.frame(res)
    png("2h_vs_15m.png",width=1000, height=1000)
    EnhancedVolcano(res, lab = rownames(res),x = 'log2FoldChange',y = 'padj', ylim = c(0, 4), pCutoff=5e-2, FCcutoff=2.0, title='', subtitleLabSize = 18, pointSize = 3.0, labSize = 5.0, colAlpha=1, legendIconSize = 4.0, drawConnectors = TRUE, widthConnectors = 0.5, colConnectors = 'black', subtitle=expression("2h versus 15m"))
    dev.off()
    
    res <- read.csv(file = paste("4h_vs_15m", "all.txt", sep="-"), row.names = 1)
    res_df <- as.data.frame(res)
    png("4h_vs_15m.png",width=1000, height=1200)
    EnhancedVolcano(res, lab = rownames(res),x = 'log2FoldChange',y = 'padj', ylim = c(0, 5.5), pCutoff=5e-2, FCcutoff=2.0, title='', subtitleLabSize = 18, pointSize = 3.0, labSize = 5.0, colAlpha=1, legendIconSize = 4.0, drawConnectors = TRUE, widthConnectors = 0.5, colConnectors = 'black', subtitle=expression("4h versus 15m"))
    dev.off()
    
    res <- read.csv(file = paste("6h_vs_15m", "all.txt", sep="-"), row.names = 1)
    res_df <- as.data.frame(res)
    png("6h_vs_15m.png",width=1000, height=1600)
    EnhancedVolcano(res, lab = rownames(res),x = 'log2FoldChange',y = 'padj', ylim = c(0, 17), pCutoff=5e-2, FCcutoff=2.0, title='', subtitleLabSize = 18, pointSize = 3.0, labSize = 5.0, colAlpha=1, legendIconSize = 4.0, drawConnectors = TRUE, widthConnectors = 0.5, colConnectors = 'black', subtitle=expression("6h versus 15m"))
    dev.off()
    
    res <- read.csv(file = paste("4d_vs_15m", "all.txt", sep="-"), row.names = 1)
    res_df <- as.data.frame(res)
    png("4d_vs_15m.png",width=1000, height=1600)
    EnhancedVolcano(res, lab = rownames(res),x = 'log2FoldChange',y = 'padj', pCutoff=5e-2, FCcutoff=2.0, title='', subtitleLabSize = 18, pointSize = 3.0, labSize = 5.0, colAlpha=1, legendIconSize = 4.0, drawConnectors = TRUE, widthConnectors = 0.5, colConnectors = 'black', subtitle=expression("4d versus 15m"))
    dev.off()
    
    #under DIR degenes under KONSOLE
    for i in 1h_vs_15m 2h_vs_15m 4h_vs_15m 6h_vs_15m 4d_vs_15m; do 
    echo "~/Tools/csv2xls-0.4/csv_to_xls.py ${i}-all.txt ${i}-up.txt ${i}-down.txt -d$',' -o ${i}.xls;"; 
    done
    
  3. Clustering the genes and draw heatmap

    for i in 1h_vs_15m 2h_vs_15m 4h_vs_15m 6h_vs_15m 4d_vs_15m; do
    echo "cut -d',' -f1-1 ${i}-up.txt > ${i}-up.id"; echo "cut -d',' -f1-1 ${i}-down.txt > ${i}-down.id"; done
    cat *.id | sort -u > ids
    #add Gene_Id in the first line, delete the ""
    GOI <- read.csv("ids")$Gene_Id
    RNASeq.NoCellLine <- assay(rld)
    #install.packages("gplots")
    library("gplots")
    #clustering methods: "ward.D", "ward.D2", "single", "complete", "average" (= UPGMA), "mcquitty" (= WPGMA), "median" (= WPGMC) or "centroid" (= UPGMC).  pearson or spearman
    datamat = RNASeq.NoCellLine[GOI, ]
    #datamat = RNASeq.NoCellLine
    write.csv(as.data.frame(datamat), file ="gene_expressions.txt")
    constant_rows <- apply(datamat, 1, function(row) var(row) == 0)
    if(any(constant_rows)) {
      cat("Removing", sum(constant_rows), "constant rows.\n")
      datamat <- datamat[!constant_rows, ]
    }
    hr <- hclust(as.dist(1-cor(t(datamat), method="pearson")), method="complete")
    hc <- hclust(as.dist(1-cor(datamat, method="spearman")), method="complete")
    mycl = cutree(hr, h=max(hr$height)/1.4)
    mycol = c("YELLOW", "DARKBLUE", "DARKORANGE", "MAGENTA", "CYAN", "GREEN", "RED", "MAROON", "LIGHTBLUE", "PINK", "MAGENTA", "LIGHTCYAN", "LIGHTGREEN"); #"LIGHTRED", 
    mycol = mycol[as.vector(mycl)]
    #png("DEGs_heatmap.png", width=900, height=800)
    #cex.lab=10, labRow="",
    png("DEGs_heatmap.png", width=900, height=1000)
    #heatmap.2(as.matrix(datamat),Rowv=as.dendrogram(hr),Colv = NA, dendrogram = 'row',
    #            scale='row',trace='none',col=bluered(75), 
    #            RowSideColors = mycol, margins=c(10,20), cexRow=1.5, srtCol=45, lhei = c(2, 8))  #rownames(datamat)  
    # cexCol, cexRow
    heatmap.2(datamat, Rowv=as.dendrogram(hr), col=bluered(75), scale="row", RowSideColors=mycol, trace="none", margin=c(5,2), sepwidth=c(0,0), dendrogram = 'row', Colv = 'false', density.info='none', labRow="", srtCol=40, lhei=c(1,7), cexCol=2)
    dev.off()
    
    #### cluster members #####
    write.csv(names(subset(mycl, mycl == '1')),file='cluster1_YELLOW.txt')
    write.csv(names(subset(mycl, mycl == '2')),file='cluster2_DARKBLUE.txt') 
    write.csv(names(subset(mycl, mycl == '3')),file='cluster3_DARKORANGE.txt')  
    write.csv(names(subset(mycl, mycl == '4')),file='cluster4_MAGENTA.txt')
    write.csv(names(subset(mycl, mycl == '5')),file='cluster5_CYAN.txt')
    write.csv(names(subset(mycl, mycl == '6')),file='cluster6_GREEN.txt')
    ~/Tools/csv2xls-0.4/csv_to_xls.py cluster*.txt -d',' -o DEGs_heatmap_cluster_members.xls
    #~/Tools/csv2xls-0.4/csv_to_xls.py significant_gene_expressions.txt -d',' -o DEGs_heatmap_expression_data.xls;
    
  4. Generate Excel files from Genbank-file

    python3 /home/jhuang/Scripts/gb_to_excel.py 1585.gb 1585.xlsx
    #insert C./P. (Chrom or Plasmid1, Plasmid2, Plasmid3) as the second chrom in the 1585.xlsx.
    

like unlike

点赞本文的读者

还没有人对此文章表态


本文有评论

没有评论

看文章,发评论,不要沉默


© 2023 XGenes.com Impressum