gene_x 0 like s 656 view s
Tags: R, pipeline, RNA-seq
The analysis provides insights into the osteoblasts' response when infected with two distinct strains of S. epidermidis: the biofilm-positive 1457 and its biofilm-negative counterpart 1457-M10. The PCA plot reveals nuanced differences between these infections at both 2-hour and 3-day intervals.
For a more detailed understanding, users can explore the DEGs_heatmap.png, showcasing a heatmap representation of all differentially expressed genes (DEGs). Associated expression data can be found in the gene_clusters.xls file.
The depth of this analysis is further evident in the detailed comparison sets, which include:
For visual clarity, each comparison set is complemented by a volcano plot and relevant data is organized in Excel tables. All necessary files and resources are provided for comprehensive exploration.
running nextflow and multiqc
ln -s /home/jhuang/Tools/nf-core-rnaseq-3.12.0/ rnaseq
(rnaseq) [jhuang@sage Data_Samira_RNAseq]$ nextflow run rnaseq/main.nf --input samplesheet.csv --outdir results_GRCh38 --genome GRCh38 -profile test_full -resume --max_memory 256.GB --max_time 2400.h --save_align_intermeds --save_unaligned --aligner 'star_salmon' --skip_multiqc
#-- rerun multiqc --
ln -s ~/Tools/rnaseq/assets/multiqc_config.yaml multiqc_config.yaml
conda activate rnaseq
multiqc -f --config multiqc_config.yaml . 2>&1
rm multiqc_config.yaml
conda deactivate
R-code for evaluation
# Import the required libraries
library("AnnotationDbi")
library("clusterProfiler")
library("ReactomePA")
library(gplots)
library(tximport)
library(DESeq2)
setwd("~/DATA/Data_Samira_RNAseq/results2_GRCh38/star_salmon")
# Define paths to your Salmon output quantification files, quant.sf refers to tx-counts, later will be summaried as gene-counts.
files <- c("1457.2h_r1" = "./1457.2h_r1/quant.sf",
"1457.2h_r2" = "./1457.2h_r2/quant.sf",
"1457.2h_r3" = "./1457.2h_r3/quant.sf",
"1457.3d_r1" = "./1457.3d_r1/quant.sf",
"1457.3d_r2" = "./1457.3d_r2/quant.sf",
"uninfected.2h_r1" = "./uninfected.2h_r1/quant.sf",
"uninfected.2h_r2" = "./uninfected.2h_r2/quant.sf",
"uninfected.3h_r3" = "./uninfected.3h_r3/quant.sf",
"uninfected.3d_r1" = "./uninfected.3d_r1/quant.sf",
"uninfected.3d_r2" = "./uninfected.3d_r2/quant.sf",
"1457-M10.2h_r1" = "./1457-M10.2h_r1/quant.sf",
"1457-M10.2h_r2" = "./1457-M10.2h_r2/quant.sf",
"1457-M10.2h_r3" = "./1457-M10.2h_r3/quant.sf",
"1457-M10.3d_r1" = "./1457-M10.3d_r1/quant.sf",
"1457-M10.3d_r2" = "./1457-M10.3d_r2/quant.sf",
"1457-M10.3d_r3" = "./1457-M10.3d_r3/quant.sf")
# ---- tx-level count data ----
# Import the transcript abundance data with tximport
txi <- tximport(files, type = "salmon", txIn = TRUE, txOut = TRUE)
# Define the replicates and condition of the samples
#replicate <- factor(c("r1", "r2", "r1", "r2", "r1", "r2", "r1", "r2", "r1", "r2"))
condition <- factor(c("1457.2h", "1457.2h", "1457.2h", "1457.3d", "1457.3d", "uninfected.2h", "uninfected.2h", "uninfected.2h", "uninfected.3d", "uninfected.3d", "1457-M10.2h", "1457-M10.2h", "1457-M10.2h", "1457-M10.3d", "1457-M10.3d", "1457-M10.3d"))
# Define the colData for DESeq2
colData <- data.frame(condition=condition, row.names=names(files))
# Create DESeqDataSet object
dds <- DESeqDataSetFromTximport(txi, colData=colData, design=~condition)
# In the context of your new code which is using tximport and DESeq2, you don't necessarily need this step. The reason is that DESeq2 performs its own filtering of low-count genes during the normalization and differential expression steps.
# Filter data to retain only genes with more than 2 counts > 3 across all samples
# dds <- dds[rowSums(counts(dds) > 3) > 2, ]
# Output raw count data to a CSV file
write.csv(counts(dds), file="transcript_counts.csv")
# ---- gene-level count data ----
# Read in the tx2gene map from salmon_tx2gene.tsv
#tx2gene <- read.csv("salmon_tx2gene.tsv", sep="\t", header=FALSE)
tx2gene <- read.table("salmon_tx2gene.tsv", header=FALSE, stringsAsFactors=FALSE)
# Set the column names
colnames(tx2gene) <- c("transcript_id", "gene_id", "gene_name")
# Remove the gene_name column if not needed
tx2gene <- tx2gene[,1:2]
# Import and summarize the Salmon data with tximport
txi <- tximport(files, type = "salmon", tx2gene = tx2gene, txOut = FALSE)
# Continue with the DESeq2 workflow as before...
colData <- data.frame(condition=condition, row.names=names(files))
dds <- DESeqDataSetFromTximport(txi, colData=colData, design=~condition)
#dds <- dds[rowSums(counts(dds) > 3) > 2, ] #60605-->26543
dds <- DESeq(dds)
rld <- rlogTransformation(dds)
write.csv(counts(dds, normalized=FALSE), file="gene_counts.csv")
#TODO: why a lot of reads were removed due to the too_short?
#STAR --runThreadN 4 --genomeDir /path/to/GenomeDir --readFilesIn /path/to/read1.fastq /path/to/read2.fastq --outFilterMatchNmin 50 --outSAMtype BAM SortedByCoordinate --outFileNamePrefix /path/to/output
dim(counts(dds))
head(counts(dds), 10)
(optional) draw 3D PCA plots.
library(gplots)
library("RColorBrewer")
library(ggplot2)
data <- plotPCA(rld, intgroup=c("condition", "replicate"), returnData=TRUE)
write.csv(data, file="plotPCA_data.csv")
#calculate all PCs including PC3 with the following codes
library(genefilter)
ntop <- 500
rv <- rowVars(assay(rld))
select <- order(rv, decreasing = TRUE)[seq_len(min(ntop, length(rv)))]
mat <- t( assay(rld)[select, ] )
pc <- prcomp(mat)
pc$x[,1:3]
#df_pc <- data.frame(pc$x[,1:3])
df_pc <- data.frame(pc$x)
identical(rownames(data), rownames(df_pc)) #-->TRUE
data$PC1 <- NULL
data$PC2 <- NULL
merged_df <- merge(data, df_pc, by = "row.names")
#merged_df <- merged_df[, -1]
row.names(merged_df) <- merged_df$Row.names
merged_df$Row.names <- NULL # remove the "name" column
merged_df$name <- NULL
merged_df <- merged_df[, c("PC1","PC2","PC3","PC4","PC5","PC6","PC7","PC8","PC9","PC10","group","condition","replicate")]
write.csv(merged_df, file="merged_df_10PCs.csv")
summary(pc)
#0.5333 0.2125 0.06852
draw_3D.py
draw 2D PCA plots.
# -- pca --
png("pca.png", 1200, 800)
plotPCA(rld, intgroup=c("condition"))
dev.off()
# -- heatmap --
png("heatmap.png", 1200, 800)
distsRL <- dist(t(assay(rld)))
mat <- as.matrix(distsRL)
hc <- hclust(distsRL)
hmcol <- colorRampPalette(brewer.pal(9,"GnBu"))(100)
heatmap.2(mat, Rowv=as.dendrogram(hc),symm=TRUE, trace="none",col = rev(hmcol), margin=c(13, 13))
dev.off()
(optional) estimate size factors
> head(dds)
class: DESeqDataSet
dim: 6 10
metadata(1): version
assays(6): counts avgTxLength ... H cooks
rownames(6): ENSG00000000003 ENSG00000000005 ... ENSG00000000460
ENSG00000000938
rowData names(34): baseMean baseVar ... deviance maxCooks
colnames(10): control_r1 control_r2 ... HSV.d8_r1 HSV.d8_r2
colData names(2): condition replicate
#convert bam to bigwig using deepTools by feeding inverse of DESeq’s size Factor
sizeFactors(dds)
#NULL
dds <- estimateSizeFactors(dds)
> sizeFactors(dds)
normalized_counts <- counts(dds, normalized=TRUE)
#write.table(normalized_counts, file="normalized_counts.txt", sep="\t", quote=F, col.names=NA)
# ---- DEBUG sizeFactors(dds) always NULL, see https://support.bioconductor.org/p/97676/ ----
nm <- assays(dds)[["avgTxLength"]]
sf <- estimateSizeFactorsForMatrix(counts(dds), normMatrix=nm)
assays(dds)$counts # for count data
assays(dds)$avgTxLength # for average transcript length, etc.
assays(dds)$normalizationFactors
In normal circumstances, the size factors should be stored in the DESeqDataSet object itself and not in the assays, so they are typically not retrievable via the assays() function. However, due to the issues you're experiencing, you might be able to manually compute the size factors and assign them back to the DESeqDataSet.
To calculate size factors manually, DESeq2 uses the median ratio method. Here's a very simplified version of how you could compute this manually:
> assays(dds)
List of length 6
names(6): counts avgTxLength normalizationFactors mu H cooks
To calculate size factors manually, DESeq2 uses the median ratio method. Here's a very simplified version of how you could compute this manually:
geoMeans <- apply(assays(dds)$counts, 1, function(row) if (all(row == 0)) 0 else exp(mean(log(row[row != 0]))))
sizeFactors(dds) <- median(assays(dds)$counts / geoMeans, na.rm = TRUE)
# ---- DEBUG END ----
#unter konsole
# control_r1 ...
# 1/0.9978755 ...
> sizeFactors(dds)
HeLa_TO_r1 HeLa_TO_r2
0.9978755 1.1092227
1/0.9978755=1.002129023
1/1.1092227=
#bamCoverage --bam ../markDuplicates/${sample}Aligned.sortedByCoord.out.bam -o ${sample}_norm.bw --binSize 10 --scaleFactor --effectiveGenomeSize 2864785220
bamCoverage --bam ../markDuplicates/HeLa_TO_r1Aligned.sortedByCoord.out.markDups.bam -o HeLa_TO_r1.bw --binSize 10 --scaleFactor 1.002129023 --effectiveGenomeSize 2864785220
bamCoverage --bam ../markDuplicates/HeLa_TO_r2Aligned.sortedByCoord.out.markDups.bam -o HeLa_TO_r2.bw --binSize 10 --scaleFactor 0.901532217 --effectiveGenomeSize 2864785220
differential expressions
#Guideline for Comparisons:
# 1457.2h vs uninfected.2h
# 1457-M10.2h vs uninfected.2h
#
# 1457.3d vs uninfected.3d
# 1457-M10.3d vs uninfected.3d
#
# 1457.2h infection vs 1457-M10 2h infection
#
# 1457.3d infection vs 1457-M10 3 day infection
#
# 1457 3 days vs 1457 2h
#
# 1457-M10 3 days vs 1457-M10 2h
#"1457.2h", "1457.2h", "1457.2h", "1457.3d", "1457.3d", "uninfected.2h", "uninfected.2h", "uninfected.2h", "uninfected.3d", "uninfected.3d", "1457-M10.2h", "1457-M10.2h", "1457-M10.2h", "1457-M10.3d", "1457-M10.3d", "1457-M10.3d"
#A central method for exploring differences between groups of segments or samples is to perform differential gene expression analysis.
dds$condition <- relevel(dds$condition, "uninfected.2h")
dds = DESeq(dds, betaPrior=FALSE)
resultsNames(dds)
clist <- c("1457.M10.2h_vs_uninfected.2h","1457.2h_vs_uninfected.2h")
dds$condition <- relevel(dds$condition, "uninfected.3d")
dds = DESeq(dds, betaPrior=FALSE)
resultsNames(dds)
clist <- c("1457.M10.3d_vs_uninfected.3d","1457.3d_vs_uninfected.3d")
dds$condition <- relevel(dds$condition, "1457-M10.2h")
dds = DESeq(dds, betaPrior=FALSE)
resultsNames(dds)
clist <- c("1457.2h_vs_1457.M10.2h")
dds$condition <- relevel(dds$condition, "1457-M10.3d")
dds = DESeq(dds, betaPrior=FALSE)
resultsNames(dds)
clist <- c("1457.3d_vs_1457.M10.3d")
dds$condition <- relevel(dds$condition, "1457.2h")
dds = DESeq(dds, betaPrior=FALSE)
resultsNames(dds)
clist <- c("1457.3d_vs_1457.2h")
dds$condition <- relevel(dds$condition, "1457-M10.2h")
dds = DESeq(dds, betaPrior=FALSE)
resultsNames(dds)
clist <- c("1457.M10.3d_vs_1457.M10.2h")
##https://bioconductor.statistik.tu-dortmund.de/packages/3.7/data/annotation/
#BiocManager::install("EnsDb.Mmusculus.v79")
#library(EnsDb.Mmusculus.v79)
#edb <- EnsDb.Mmusculus.v79
#https://bioconductor.org/packages/release/bioc/vignettes/biomaRt/inst/doc/accessing_ensembl.html#selecting-an-ensembl-biomart-database-and-dataset
#https://bioconductor.org/packages/release/bioc/vignettes/biomaRt/inst/doc/accessing_ensembl.html#selecting-an-ensembl-biomart-database-and-dataset
library(biomaRt)
listEnsembl()
listMarts()
#ensembl <- useEnsembl(biomart = "genes", mirror="asia") # default is Mouse strains 104
#ensembl <- useEnsembl(biomart = "ensembl", dataset = "mmusculus_gene_ensembl", mirror = "www")
#ensembl = useMart("ensembl_mart_44", dataset="hsapiens_gene_ensembl",archive=TRUE, mysql=TRUE)
#ensembl <- useEnsembl(biomart = "ensembl", dataset = "mmusculus_gene_ensembl", version="104")
#ensembl <- useEnsembl(biomart = "ensembl", dataset = "hsapiens_gene_ensembl", version="86")
#--> total 69, 27 GRCh38.p7 and 39 GRCm38.p4; we should take 104, since rnaseq-pipeline is also using annotation of 104!
ensembl <- useEnsembl(biomart = "ensembl", dataset = "hsapiens_gene_ensembl", version="104")
datasets <- listDatasets(ensembl)
#--> total 202 80 GRCh38.p13 107 GRCm39
#80 hsapiens_gene_ensembl Human genes (GRCh38.p13) GRCh38.p13
#107 mmusculus_gene_ensembl Mouse genes (GRCm39) GRCm39
> listEnsemblArchives()
name date url version
1 Ensembl GRCh37 Feb 2014 https://grch37.ensembl.org GRCh37
2 Ensembl 109 Feb 2023 https://feb2023.archive.ensembl.org 109
3 Ensembl 108 Oct 2022 https://oct2022.archive.ensembl.org 108
4 Ensembl 107 Jul 2022 https://jul2022.archive.ensembl.org 107
5 Ensembl 106 Apr 2022 https://apr2022.archive.ensembl.org 106
6 Ensembl 105 Dec 2021 https://dec2021.archive.ensembl.org 105
7 Ensembl 104 May 2021 https://may2021.archive.ensembl.org 104
attributes = listAttributes(ensembl)
attributes[1:25,]
#https://www.ncbi.nlm.nih.gov/grc/human
#BiocManager::install("org.Mmu.eg.db")
#library("org.Mmu.eg.db")
#edb <- org.Mmu.eg.db
#
#https://bioconductor.statistik.tu-dortmund.de/packages/3.6/data/annotation/
#EnsDb.Mmusculus.v79
#> query(hub, c("EnsDb", "apiens", "98"))
#columns(edb)
#searchAttributes(mart = ensembl, pattern = "symbol")
##https://www.geeksforgeeks.org/remove-duplicate-rows-based-on-multiple-columns-using-dplyr-in-r/
library(dplyr)
library(tidyverse)
#df <- data.frame (lang =c ('Java','C','Python','GO','RUST','Javascript',
'Cpp','Java','Julia','Typescript','Python','GO'),
value = c (21,21,3,5,180,9,12,20,6,0,3,6),
usage =c(21,21,0,99,44,48,53,16,6,8,0,6))
#distinct(df, lang, .keep_all= TRUE)
for (i in clist) {
#i<-clist[1]
contrast = paste("condition", i, sep="_")
res = results(dds, name=contrast)
res <- res[!is.na(res$log2FoldChange),]
#geness <- AnnotationDbi::select(edb86, keys = rownames(res), keytype = "GENEID", columns = c("ENTREZID","EXONID","GENEBIOTYPE","GENEID","GENENAME","PROTEINDOMAINSOURCE","PROTEINID","SEQNAME","SEQSTRAND","SYMBOL","TXBIOTYPE","TXID","TXNAME","UNIPROTID"))
#geness <- AnnotationDbi::select(edb86, keys = rownames(res), keytype = "GENEID", columns = c("GENEID", "ENTREZID", "SYMBOL", "GENENAME","GENEBIOTYPE","TXBIOTYPE","SEQSTRAND","UNIPROTID"))
# In the ENSEMBL-database, GENEID is ENSEMBL-ID.
#geness <- AnnotationDbi::select(edb86, keys = rownames(res), keytype = "GENEID", columns = c("GENEID", "SYMBOL", "GENEBIOTYPE")) # "ENTREZID", "TXID","TXBIOTYPE","TXSEQSTART","TXSEQEND"
#geness <- geness[!duplicated(geness$GENEID), ]
#using getBM replacing AnnotationDbi::select
#filters = 'ensembl_gene_id' means the records should always have a valid ensembl_gene_ids.
geness <- getBM(attributes = c('ensembl_gene_id', 'external_gene_name', 'gene_biotype', 'entrezgene_id', 'chromosome_name', 'start_position', 'end_position', 'strand', 'description'),
filters = 'ensembl_gene_id',
values = rownames(res),
mart = ensembl)
geness_uniq <- distinct(geness, ensembl_gene_id, .keep_all= TRUE)
#merge by column by common colunmn name, in the case "GENEID"
res$ENSEMBL = rownames(res)
identical(rownames(res), geness_uniq$ensembl_gene_id)
res_df <- as.data.frame(res)
geness_res <- merge(geness_uniq, res_df, by.x="ensembl_gene_id", by.y="ENSEMBL")
dim(geness_res)
rownames(geness_res) <- geness_res$ensembl_gene_id
geness_res$ensembl_gene_id <- NULL
write.csv(as.data.frame(geness_res[order(geness_res$pvalue),]), file = paste(i, "all.txt", sep="-"))
up <- subset(geness_res, padj<=0.05 & log2FoldChange>=2)
down <- subset(geness_res, padj<=0.05 & log2FoldChange<=-2)
write.csv(as.data.frame(up[order(up$log2FoldChange,decreasing=TRUE),]), file = paste(i, "up.txt", sep="-"))
write.csv(as.data.frame(down[order(abs(down$log2FoldChange),decreasing=TRUE),]), file = paste(i, "down.txt", sep="-"))
}
#-- show methods of class DESeq2 --
#x=capture.output(showMethods(class="DESeq2"))
#unlist(lapply(strsplit(x[grep("Function: ",x,)]," "),function(x) x[2]))
volcano plots with automatically finding top_g
#A canonical visualization for interpreting differential gene expression results is the volcano plot.
library(ggrepel)
for i in 1457.M10.2h_vs_uninfected.2h 1457.2h_vs_uninfected.2h 1457.M10.3d_vs_uninfected.3d 1457.3d_vs_uninfected.3d 1457.2h_vs_1457.M10.2h 1457.3d_vs_1457.M10.3d 1457.3d_vs_1457.2h 1457.M10.3d_vs_1457.M10.2h; do
#HSV.d4_vs_control HSV.d6_vs_control HSV.d8_vs_control
#for i in K3R_24hdox_vs_K3R_3hdox21hchase WT_3hdox21hchase_vs_K3R_3hdox21hchase; do
#for i in WT_24hdox_vs_K3R_24hdox; do
#for i in WT_24hdox_vs_WT_3hdox21hchase; do
# read files to geness_res
echo "geness_res <- read.csv(file = paste(\"${i}\", \"all.txt\", sep=\"-\"), row.names=1)"
echo "subset(geness_res, external_gene_name %in% top_g & pvalue < 0.05 & (abs(geness_res\$log2FoldChange) >= 2.0))"
echo "geness_res\$Color <- \"NS or log2FC < 2.0\""
echo "geness_res\$Color[geness_res\$pvalue < 0.05] <- \"P < 0.05\""
echo "geness_res\$Color[geness_res\$padj < 0.05] <- \"P-adj < 0.05\""
echo "geness_res\$Color[abs(geness_res\$log2FoldChange) < 2.0] <- \"NS or log2FC < 2.0\""
echo "geness_res\$Color <- factor(geness_res\$Color, levels = c(\"NS or log2FC < 2.0\", \"P < 0.05\", \"P-adj < 0.05\"))"
echo "write.csv(geness_res, \"${i}_with_Category.csv\")"
# pick top genes for either side of volcano to label
# order genes for convenience:
echo "geness_res\$invert_P <- (-log10(geness_res\$pvalue)) * sign(geness_res\$log2FoldChange)"
echo "top_g <- c()"
echo "top_g <- c(top_g, \
geness_res[, 'external_gene_name'][order(geness_res[, 'invert_P'], decreasing = TRUE)[1:100]], \
geness_res[, 'external_gene_name'][order(geness_res[, 'invert_P'], decreasing = FALSE)[1:100]])"
echo "top_g <- unique(top_g)"
echo "geness_res <- geness_res[, -1*ncol(geness_res)]" # remove invert_P from matrix
# Graph results
echo "png(\"${i}.png\",width=1200, height=2000)"
echo "ggplot(geness_res, \
aes(x = log2FoldChange, y = -log10(pvalue), \
color = Color, label = external_gene_name)) + \
geom_vline(xintercept = c(2.0, -2.0), lty = \"dashed\") + \
geom_hline(yintercept = -log10(0.05), lty = \"dashed\") + \
geom_point() + \
labs(x = \"log2(FC)\", y = \"Significance, -log10(P)\", color = \"Significance\") + \
scale_color_manual(values = c(\"P-adj < 0.05\"=\"darkblue\",\"P < 0.05\"=\"lightblue\",\"NS or log2FC < 2.0\"=\"darkgray\"),guide = guide_legend(override.aes = list(size = 4))) + scale_y_continuous(expand = expansion(mult = c(0,0.05))) + \
geom_text_repel(data = subset(geness_res, external_gene_name %in% top_g & pvalue < 0.05 & (abs(geness_res\$log2FoldChange) >= 2.0)), size = 4, point.padding = 0.15, color = \"black\", min.segment.length = .1, box.padding = .2, lwd = 2) + \
theme_bw(base_size = 16) + \
theme(legend.position = \"bottom\")"
echo "dev.off()"
done
sed -i -e 's/Color/Category/g' *_Category.csv
for i in 1457.M10.2h_vs_uninfected.2h 1457.2h_vs_uninfected.2h 1457.M10.3d_vs_uninfected.3d 1457.3d_vs_uninfected.3d 1457.2h_vs_1457.M10.2h 1457.3d_vs_1457.M10.3d 1457.3d_vs_1457.2h 1457.M10.3d_vs_1457.M10.2h; do
echo "~/Tools/csv2xls-0.4/csv_to_xls.py ${i}-all.txt ${i}-up.txt ${i}-down.txt -d$',' -o ${i}.xls;"
done
clustering the genes and draw heatmap
install.packages("gplots")
library("gplots")
for i in 1457.M10.2h_vs_uninfected.2h 1457.2h_vs_uninfected.2h 1457.M10.3d_vs_uninfected.3d 1457.3d_vs_uninfected.3d 1457.2h_vs_1457.M10.2h 1457.3d_vs_1457.M10.3d 1457.3d_vs_1457.2h 1457.M10.3d_vs_1457.M10.2h; do
echo "cut -d',' -f1-1 ${i}-up.txt > ${i}-up.id"
echo "cut -d',' -f1-1 ${i}-down.txt > ${i}-down.id"
done
# 1 1457.2h_vs_1457.M10.2h-down.id
# 1 1457.2h_vs_1457.M10.2h-up.id
# 23 1457.2h_vs_uninfected.2h-down.id
# 74 1457.2h_vs_uninfected.2h-up.id
# 126 1457.3d_vs_1457.2h-down.id
# 61 1457.3d_vs_1457.2h-up.id
# 2 1457.3d_vs_1457.M10.3d-down.id
# 2 1457.3d_vs_1457.M10.3d-up.id
# 97 1457.3d_vs_uninfected.3d-down.id
# 79 1457.3d_vs_uninfected.3d-up.id
# 17 1457.M10.2h_vs_uninfected.2h-down.id
# 82 1457.M10.2h_vs_uninfected.2h-up.id
# 162 1457.M10.3d_vs_1457.M10.2h-down.id
# 69 1457.M10.3d_vs_1457.M10.2h-up.id
# 171 1457.M10.3d_vs_uninfected.3d-down.id
# 124 1457.M10.3d_vs_uninfected.3d-up.id
cat *.id | sort -u > ids
#add Gene_Id in the first line, delete the ""
GOI <- read.csv("ids")$Gene_Id #570 genes
RNASeq.NoCellLine <- assay(rld)
# Defining the custom order
column_order <- c(
"uninfected.2h_r1", "uninfected.2h_r2", "uninfected.3h_r3",
"uninfected.3d_r1", "uninfected.3d_r2",
"1457-M10.2h_r1", "1457-M10.2h_r2", "1457-M10.2h_r3","1457.2h_r1", "1457.2h_r2", "1457.2h_r3",
"1457-M10.3d_r1", "1457-M10.3d_r2", "1457-M10.3d_r3","1457.3d_r1", "1457.3d_r2"
)
RNASeq.NoCellLine_reordered <- RNASeq.NoCellLine[, column_order]
head(RNASeq.NoCellLine_reordered)
#clustering methods: "ward.D", "ward.D2", "single", "complete", "average" (= UPGMA), "mcquitty" (= WPGMA), "median" (= WPGMC) or "centroid" (= UPGMC). pearson or spearman
datamat = RNASeq.NoCellLine_reordered[GOI, ]
write.csv(as.data.frame(datamat), file ="DEGs_heatmap_data.csv")
hr <- hclust(as.dist(1-cor(t(datamat), method="pearson")), method="complete")
hc <- hclust(as.dist(1-cor(datamat, method="spearman")), method="complete")
mycl = cutree(hr, h=max(hr$height)/1.08)
mycol = c("YELLOW", "DARKBLUE", "DARKORANGE", "DARKMAGENTA", "DARKCYAN", "DARKRED", "MAROON", "DARKGREEN", "LIGHTBLUE", "PINK", "MAGENTA", "LIGHTCYAN","LIGHTGREEN", "BLUE", "ORANGE", "CYAN", "RED", "GREEN");
mycol = mycol[as.vector(mycl)]
png("DEGs_heatmap.png", width=900, height=1010)
heatmap.2(as.matrix(datamat),Rowv=as.dendrogram(hr),Colv = NA, dendrogram = 'row',
scale='row',trace='none',col=bluered(75),
RowSideColors = mycol, labRow="", srtCol=20, keysize=0.72, cexRow = 2, cexCol = 1.4)
dev.off()
# Extract rows from datamat where the row names match the identifiers in subset_1
#### cluster members #####
subset_1<-names(subset(mycl, mycl == '1'))
data <- as.data.frame(datamat[rownames(datamat) %in% subset_1, ]) #212
subset_2<-names(subset(mycl, mycl == '2'))
data <- as.data.frame(datamat[rownames(datamat) %in% subset_2, ]) #185
subset_3<-names(subset(mycl, mycl == '3'))
data <- as.data.frame(datamat[rownames(datamat) %in% subset_3, ]) #173
# Initialize an empty data frame for the annotated data
annotated_data <- data.frame()
# Determine total number of genes
total_genes <- length(rownames(data))
# Loop through each gene to annotate
for (i in 1:total_genes) {
gene <- rownames(data)[i]
result <- getBM(attributes = c('ensembl_gene_id', 'external_gene_name', 'gene_biotype', 'entrezgene_id', 'chromosome_name', 'start_position', 'end_position', 'strand', 'description'),
filters = 'ensembl_gene_id',
values = gene,
mart = ensembl)
# If multiple rows are returned, take the first one
if (nrow(result) > 1) {
result <- result[1, ]
}
# Check if the result is empty
if (nrow(result) == 0) {
result <- data.frame(ensembl_gene_id = gene,
external_gene_name = NA,
gene_biotype = NA,
entrezgene_id = NA,
chromosome_name = NA,
start_position = NA,
end_position = NA,
strand = NA,
description = NA)
}
# Transpose expression values
expression_values <- t(data.frame(t(data[gene, ])))
colnames(expression_values) <- colnames(data)
# Combine gene information and expression data
combined_result <- cbind(result, expression_values)
# Append to the final dataframe
annotated_data <- rbind(annotated_data, combined_result)
# Print progress every 100 genes
if (i %% 100 == 0) {
cat(sprintf("Processed gene %d out of %d\n", i, total_genes))
}
}
# Save the annotated data to a new CSV file
write.csv(annotated_data, "cluster1_YELLOW.csv", row.names=FALSE)
write.csv(annotated_data, "cluster2_DARKBLUE.csv", row.names=FALSE)
write.csv(annotated_data, "cluster3_DARKORANGE.csv", row.names=FALSE)
#~/Tools/csv2xls-0.4/csv_to_xls.py cluster*.csv -d',' -o gene_clusters.xls
点赞本文的读者
还没有人对此文章表态
没有评论
RNA-seq skin organoids on GRCh38+chrHsv1 (final)
RNAseq Analysis LT-K331A-LTtr-control
© 2023 XGenes.com Impressum