RNA-seq data analysis (R-part) for S. epidermidis 1585

gene_x 0 like s 294 view s

Tags: R, processing, RNA-seq

Data input and generate PCA plot

# Import the required libraries
library("AnnotationDbi")
library("clusterProfiler")
library("ReactomePA")
library(gplots)
library(tximport)
library(DESeq2)
setwd("/home/jhuang/DATA/Data_Marc_RNA-seq_Sepidermidis/results_1585/star_salmon")
# Define paths to your Salmon output quantification files
files <- c("15m_A" = "./EX_15_min_A/quant.sf",
          "15m_B" = "./EX_15_min_B/quant.sf",
          "15m_C" = "./EX_15_min_C/quant.sf",
          "1h_A" = "./EX_1h_A/quant.sf",
          "1h_B" = "./EX_1h_B/quant.sf",
          "1h_C" = "./EX_1h_C/quant.sf",
          "2h_A" = "./EX_2h_A/quant.sf",
          "2h_B" = "./EX_2h_B/quant.sf",
          "2h_C" = "./EX_2h_C/quant.sf",
          "4h_A" = "./EX_4h_A/quant.sf",
          "4h_B" = "./EX_4h_B/quant.sf",
          "4h_C" = "./EX_4h_C/quant.sf",
          "6h_A" = "./EX_6h_A/quant.sf",
          "6h_B" = "./EX_6h_B/quant.sf",
          "6h_C" = "./EX_6h_C/quant.sf",
          "4d_A" = "./EX_Day_4_A/quant.sf",
          "4d_B" = "./EX_Day_4_B/quant.sf",
          "4d_C" = "./EX_Day_4_C/quant.sf")
# Import the transcript abundance data with tximport
txi <- tximport(files, type = "salmon", txIn = TRUE, txOut = TRUE)
# Define the replicates and condition of the samples
replicate <- factor(c("A", "B", "C", "A", "B", "C", "A", "B", "C", "A", "B", "C", "A", "B", "C", "A", "B", "C"))
condition <- factor(c("15m", "15m", "15m", "1h", "1h", "1h", "2h", "2h", "2h", "4h", "4h", "4h", "6h", "6h", "6h", "4d", "4d", "4d"))
# Define the colData for DESeq2
colData <- data.frame(condition=condition, replicate=replicate, row.names=names(files))
# Create DESeqDataSet object
dds <- DESeqDataSetFromTximport(txi, colData=colData, design=~condition)
# In the context of your new code which is using tximport and DESeq2, you don't necessarily need this step. The reason is that DESeq2 performs its own filtering of low-count genes during the normalization and differential expression steps.
# Filter data to retain only genes with more than 2 counts > 3 across all samples
# dds <- dds[rowSums(counts(dds) > 3) > 2, ]
# Output raw count data to a CSV file
write.csv(counts(dds), file="transcript_counts.csv")
# -- gene-level count data --
# Read in the tx2gene map from salmon_tx2gene.tsv
#tx2gene <- read.csv("salmon_tx2gene.tsv", sep="\t", header=FALSE)
tx2gene <- read.table("salmon_tx2gene.tsv", header=FALSE, stringsAsFactors=FALSE)
# Set the column names
colnames(tx2gene) <- c("transcript_id", "gene_id", "gene_name")
# Remove the gene_name column if not needed
tx2gene <- tx2gene[,1:2]
# Import and summarize the Salmon data with tximport
txi <- tximport(files, type = "salmon", tx2gene = tx2gene, txOut = FALSE)
# Continue with the DESeq2 workflow as before...
colData <- data.frame(condition=condition, replicate=replicate, row.names=names(files))
dds <- DESeqDataSetFromTximport(txi, colData=colData, design=~condition)
#dds <- dds[rowSums(counts(dds) > 3) > 2, ]    #60605-->26543
write.csv(counts(dds, normalized=FALSE), file="gene_counts.csv")
#~/Tools/csv2xls-0.4/csv_to_xls.py gene_counts.csv -d',' -o gene_counts.xls

#TODO: why a lot of reads were removed due to the too_short?
#STAR --runThreadN 4 --genomeDir /path/to/GenomeDir --readFilesIn /path/to/read1.fastq /path/to/read2.fastq --outFilterMatchNmin 50 --outSAMtype BAM SortedByCoordinate --outFileNamePrefix /path/to/output
dim(counts(dds))
head(counts(dds), 10)
#DEBUG: DESeq should not used here!?
#TODO_NEXT_WEEK: rerun without fistly DESeq(dds) to compare if the results is the same to process_1 
#dds <- DESeq(dds)
rld <- rlogTransformation(dds)

# draw simple pca and heatmap
library(gplots) 
library("RColorBrewer")
#mat <- assay(rld)
#mm <- model.matrix(~condition, colData(rld))
#mat <- limma::removeBatchEffect(mat, batch=rld$batch, design=mm)
#assay(rld) <- mat
# -- pca --
png("pca.png", 1200, 800)
plotPCA(rld, intgroup=c("condition"))
dev.off()
# -- heatmap --
png("heatmap.png", 1200, 800)
distsRL <- dist(t(assay(rld)))
mat <- as.matrix(distsRL)
hc <- hclust(distsRL)
hmcol <- colorRampPalette(brewer.pal(9,"GnBu"))(100)
heatmap.2(mat, Rowv=as.dendrogram(hc),symm=TRUE, trace="none",col = rev(hmcol), margin=c(13, 13))
dev.off()

Analysis on differentially expressed genes

setwd("/home/jhuang/DATA/Data_Marc_RNA-seq_Sepidermidis/results_1585/star_salmon/degenes")

dds$condition <- relevel(dds$condition, "15m")
dds = DESeq(dds, betaPrior=FALSE)
resultsNames(dds)
clist <- c("1h_vs_15m", "2h_vs_15m", "4h_vs_15m", "6h_vs_15m",  "4d_vs_15m")

for (i in clist) {
  contrast = paste("condition", i, sep="_")
  res = results(dds, name=contrast)
  res <- res[!is.na(res$log2FoldChange),]
  res_df <- as.data.frame(res)
  write.csv(as.data.frame(res_df[order(res_df$pvalue),]), file = paste(i, "all.txt", sep="-"))
  up <- subset(res_df, padj<=0.05 & log2FoldChange>=2)
  down <- subset(res_df, padj<=0.05 & log2FoldChange<=-2)
  write.csv(as.data.frame(up[order(up$log2FoldChange,decreasing=TRUE),]), file = paste(i, "up.txt", sep="-"))
  write.csv(as.data.frame(down[order(abs(down$log2FoldChange),decreasing=TRUE),]), file = paste(i, "down.txt", sep="-"))
}

library("EnhancedVolcano")
for i in 1h_vs_15m 2h_vs_15m 4h_vs_15m 6h_vs_15m 4d_vs_15m; do
  echo "res <- read.csv(file = paste(\"${i}\", \"all.txt\", sep=\"-\"), row.names = 1)"
  echo "res_df <- as.data.frame(res)"
  echo "png(\"${i}.png\",width=800, height=600)"
  #legendPosition = 'right',legendLabSize = 12,  arrowheads = FALSE,
  #echo "EnhancedVolcano(res, lab = rownames(res),x = 'log2FoldChange',y = 'padj', pCutoff=5e-2, FCcutoff=1.2, title='', subtitleLabSize = 18, pointSize = 3.0, labSize = 5.0, colAlpha=1, legendIconSize = 4.0, drawConnectors = TRUE, widthConnectors = 0.5, colConnectors = 'black', subtitle=expression(~Delta*\"$(echo $i | cut -d'_' -f1) versus \" *~Delta*\"$(echo $i | cut -d'_' -f3)\"))"
  echo "EnhancedVolcano(res, lab = rownames(res),x = 'log2FoldChange',y = 'padj', pCutoff=5e-2, FCcutoff=1.0, title='', subtitleLabSize = 18, pointSize = 3.0, labSize = 5.0, colAlpha=1, legendIconSize = 4.0, drawConnectors = TRUE, widthConnectors = 0.5, colConnectors = 'black', subtitle=expression(\"$(echo $i | cut -d'_' -f1) versus $(echo $i | cut -d'_' -f3)\"))"
  echo "dev.off()"
done

res <- read.csv(file = paste("1h_vs_15m", "all.txt", sep="-"), row.names = 1)
res_df <- as.data.frame(res)
png("1h_vs_15m.png",width=1000, height=800)
EnhancedVolcano(res, lab = rownames(res),x = 'log2FoldChange',y = 'padj', ylim = c(0, 1), pCutoff=5e-2, FCcutoff=2.0, title='', subtitleLabSize = 18, pointSize = 3.0, labSize = 5.0, colAlpha=1, legendIconSize = 4.0, drawConnectors = TRUE, widthConnectors = 0.5, colConnectors = 'black', subtitle=expression("1h versus 15m"))
dev.off()

res <- read.csv(file = paste("2h_vs_15m", "all.txt", sep="-"), row.names = 1)
res_df <- as.data.frame(res)
png("2h_vs_15m.png",width=1000, height=1000)
EnhancedVolcano(res, lab = rownames(res),x = 'log2FoldChange',y = 'padj', ylim = c(0, 4), pCutoff=5e-2, FCcutoff=2.0, title='', subtitleLabSize = 18, pointSize = 3.0, labSize = 5.0, colAlpha=1, legendIconSize = 4.0, drawConnectors = TRUE, widthConnectors = 0.5, colConnectors = 'black', subtitle=expression("2h versus 15m"))
dev.off()

res <- read.csv(file = paste("4h_vs_15m", "all.txt", sep="-"), row.names = 1)
res_df <- as.data.frame(res)
png("4h_vs_15m.png",width=1000, height=1200)
EnhancedVolcano(res, lab = rownames(res),x = 'log2FoldChange',y = 'padj', ylim = c(0, 5.5), pCutoff=5e-2, FCcutoff=2.0, title='', subtitleLabSize = 18, pointSize = 3.0, labSize = 5.0, colAlpha=1, legendIconSize = 4.0, drawConnectors = TRUE, widthConnectors = 0.5, colConnectors = 'black', subtitle=expression("4h versus 15m"))
dev.off()

res <- read.csv(file = paste("6h_vs_15m", "all.txt", sep="-"), row.names = 1)
res_df <- as.data.frame(res)
png("6h_vs_15m.png",width=1000, height=1600)
EnhancedVolcano(res, lab = rownames(res),x = 'log2FoldChange',y = 'padj', ylim = c(0, 17), pCutoff=5e-2, FCcutoff=2.0, title='', subtitleLabSize = 18, pointSize = 3.0, labSize = 5.0, colAlpha=1, legendIconSize = 4.0, drawConnectors = TRUE, widthConnectors = 0.5, colConnectors = 'black', subtitle=expression("6h versus 15m"))
dev.off()

res <- read.csv(file = paste("4d_vs_15m", "all.txt", sep="-"), row.names = 1)
res_df <- as.data.frame(res)
png("4d_vs_15m.png",width=1000, height=1600)
EnhancedVolcano(res, lab = rownames(res),x = 'log2FoldChange',y = 'padj', pCutoff=5e-2, FCcutoff=2.0, title='', subtitleLabSize = 18, pointSize = 3.0, labSize = 5.0, colAlpha=1, legendIconSize = 4.0, drawConnectors = TRUE, widthConnectors = 0.5, colConnectors = 'black', subtitle=expression("4d versus 15m"))
dev.off()

#under DIR degenes under KONSOLE
for i in 1h_vs_15m 2h_vs_15m 4h_vs_15m 6h_vs_15m 4d_vs_15m; do 
echo "~/Tools/csv2xls-0.4/csv_to_xls.py ${i}-all.txt ${i}-up.txt ${i}-down.txt -d$',' -o ${i}.xls;"; 
done

Clustering the genes and draw heatmap

for i in 1h_vs_15m 2h_vs_15m 4h_vs_15m 6h_vs_15m 4d_vs_15m; do
echo "cut -d',' -f1-1 ${i}-up.txt > ${i}-up.id"; echo "cut -d',' -f1-1 ${i}-down.txt > ${i}-down.id"; done
cat *.id | sort -u > ids
#add Gene_Id in the first line, delete the ""
GOI <- read.csv("ids")$Gene_Id
RNASeq.NoCellLine <- assay(rld)
#install.packages("gplots")
library("gplots")
#clustering methods: "ward.D", "ward.D2", "single", "complete", "average" (= UPGMA), "mcquitty" (= WPGMA), "median" (= WPGMC) or "centroid" (= UPGMC).  pearson or spearman
datamat = RNASeq.NoCellLine[GOI, ]
#datamat = RNASeq.NoCellLine
write.csv(as.data.frame(datamat), file ="gene_expressions.txt")
constant_rows <- apply(datamat, 1, function(row) var(row) == 0)
if(any(constant_rows)) {
  cat("Removing", sum(constant_rows), "constant rows.\n")
  datamat <- datamat[!constant_rows, ]
}
hr <- hclust(as.dist(1-cor(t(datamat), method="pearson")), method="complete")
hc <- hclust(as.dist(1-cor(datamat, method="spearman")), method="complete")
mycl = cutree(hr, h=max(hr$height)/1.4)
mycol = c("YELLOW", "DARKBLUE", "DARKORANGE", "MAGENTA", "CYAN", "GREEN", "RED", "MAROON", "LIGHTBLUE", "PINK", "MAGENTA", "LIGHTCYAN", "LIGHTGREEN"); #"LIGHTRED", 
mycol = mycol[as.vector(mycl)]
#png("DEGs_heatmap.png", width=900, height=800)
#cex.lab=10, labRow="",
png("DEGs_heatmap.png", width=900, height=1000)
#heatmap.2(as.matrix(datamat),Rowv=as.dendrogram(hr),Colv = NA, dendrogram = 'row',
#            scale='row',trace='none',col=bluered(75), 
#            RowSideColors = mycol, margins=c(10,20), cexRow=1.5, srtCol=45, lhei = c(2, 8))  #rownames(datamat)  
# cexCol, cexRow
heatmap.2(datamat, Rowv=as.dendrogram(hr), col=bluered(75), scale="row", RowSideColors=mycol, trace="none", margin=c(5,2), sepwidth=c(0,0), dendrogram = 'row', Colv = 'false', density.info='none', labRow="", srtCol=40, lhei=c(1,7), cexCol=2)
dev.off()

#### cluster members #####
write.csv(names(subset(mycl, mycl == '1')),file='cluster1_YELLOW.txt')
write.csv(names(subset(mycl, mycl == '2')),file='cluster2_DARKBLUE.txt') 
write.csv(names(subset(mycl, mycl == '3')),file='cluster3_DARKORANGE.txt')  
write.csv(names(subset(mycl, mycl == '4')),file='cluster4_MAGENTA.txt')
write.csv(names(subset(mycl, mycl == '5')),file='cluster5_CYAN.txt')
write.csv(names(subset(mycl, mycl == '6')),file='cluster6_GREEN.txt')
~/Tools/csv2xls-0.4/csv_to_xls.py cluster*.txt -d',' -o DEGs_heatmap_cluster_members.xls
#~/Tools/csv2xls-0.4/csv_to_xls.py significant_gene_expressions.txt -d',' -o DEGs_heatmap_expression_data.xls;

Generate Excel files from Genbank-file

python3 /home/jhuang/Scripts/gb_to_excel.py 1585.gb 1585.xlsx
#insert C./P. (Chrom or Plasmid1, Plasmid2, Plasmid3) as the second chrom in the 1585.xlsx.

like unlike

点赞本文的读者

还没有人对此文章表态

本文有评论

没有评论

RNA-seq data analysis (R-part) for S. epidermidis 1585

本文有评论

看文章，发评论，不要沉默

最受欢迎文章

最新文章

最多评论文章

推荐相似文章