gene_x 0 like s 569 view s
Tags: python, R, pipeline, RNA-seq
nextflow
(rnaseq) [jhuang@sage Data_Denise_LT_RNAseq]$ nextflow run rnaseq/main.nf --input samplesheet.csv --outdir results_GRCh38 --genome GRCh38 -profile test_full -resume --max_memory 256.GB --max_time 2400.h --save_align_intermeds --save_unaligned --aligner 'star_salmon' --skip_multiqc
ln -s ~/Tools/rnaseq/assets/multiqc_config.yaml multiqc_config.yaml
multiqc -f --config multiqc_config.yaml . 2>&1
rm multiqc_config.yaml
construct DESeqDataSet
# Import the required libraries
library("AnnotationDbi")
library("clusterProfiler")
library("ReactomePA")
library(gplots)
library(tximport)
library(DESeq2)
setwd("~/DATA/Data_Denise_LT_RNASeq/results_GRCh38/star_salmon")
# Define paths to your Salmon output quantification files, quant.sf refers to tx-counts, later will be summaried as gene-counts.
files <- c("control_DI" = "./control_d8_DI/quant.sf",
"control_DII" = "./control_d8_DII/quant.sf",
#"control_DII" = "./control_d8_DII_re/quant.sf",
"LT_DI" = "./LT_d8_DI/quant.sf",
"LT_DII" = "./LT_d8_DII/quant.sf",
"LTtr_DI" = "./LTtr_d8_DI/quant.sf",
"LTtr_DII" = "./LTtr_d8_DII/quant.sf",
"LT_K331A_DI" = "./LT_K331A_d8_DI/quant.sf",
#"LT_K331A_DII" = "./LT_K331A_d8_DII/quant.sf",
"LT_K331A_DII" = "./LT_K331A_d8_DII_re/quant.sf")
# ---- tx-level count data ----
# Import the transcript abundance data with tximport
txi <- tximport(files, type = "salmon", txIn = TRUE, txOut = TRUE)
# Define the replicates (or donors) and condition of the samples
#donor <- factor(c("DI", "DII", "DII", "DI", "DII", "DI", "DII", "DI", "DII", "DII"))
#batch <- factor(c("batch1", "batch1", "batch2", "batch1", "batch1", "batch1", "batch1", "batch1", "batch1", "batch2"))
#condition <- factor(c("control", "control", "control", "LT", "LT", "LTtr", "LTtr", "LT_K331A", "LT_K331A", "LT_K331A"))
donor <- factor(c("DI", "DII", "DI", "DII", "DI", "DII", "DI", "DII"))
batch <- factor(c("batch1", "batch1", "batch1", "batch1", "batch1", "batch1", "batch1", "batch2"))
condition <- factor(c("control", "control", "LT", "LT", "LTtr", "LTtr", "LT_K331A", "LT_K331A"))
# Output raw count data to a CSV file
write.csv(counts(dds), file="transcript_counts.csv")
# ---- gene-level count data ----
# Read in the tx2gene map from salmon_tx2gene.tsv
#tx2gene <- read.csv("salmon_tx2gene.tsv", sep="\t", header=FALSE)
tx2gene <- read.table("salmon_tx2gene.tsv", header=FALSE, stringsAsFactors=FALSE)
# Set the column names
colnames(tx2gene) <- c("transcript_id", "gene_id", "gene_name")
# Remove the gene_name column if not needed
tx2gene <- tx2gene[,1:2]
# Import and summarize the Salmon data with tximport
txi <- tximport(files, type = "salmon", tx2gene = tx2gene, txOut = FALSE)
# Continue with the DESeq2 workflow as before...
colData <- data.frame(donor=donor, batch=batch, condition=condition, row.names=names(files))
dds <- DESeqDataSetFromTximport(txi, colData=colData, design=~donor+condition)
#dds <- dds[rowSums(counts(dds) > 3) > 2, ] #60605-->26543
dds <- DESeq(dds)
rld <- rlogTransformation(dds)
write.csv(counts(dds, normalized=FALSE), file="gene_counts.csv")
dim(counts(dds))
head(counts(dds), 10)
draw 3D PCA plots.
library(gplots)
library("RColorBrewer")
library(ggplot2)
data <- plotPCA(rld, intgroup=c("condition", "donor"), returnData=TRUE)
write.csv(data, file="plotPCA_data.csv")
#calculate all PCs including PC3 with the following codes
library(genefilter)
ntop <- 500
rv <- rowVars(assay(rld))
select <- order(rv, decreasing = TRUE)[seq_len(min(ntop, length(rv)))]
mat <- t( assay(rld)[select, ] )
pc <- prcomp(mat)
pc$x[,1:3]
#df_pc <- data.frame(pc$x[,1:3])
df_pc <- data.frame(pc$x)
identical(rownames(data), rownames(df_pc)) #-->TRUE
data$PC1 <- NULL
data$PC2 <- NULL
merged_df <- merge(data, df_pc, by = "row.names")
#merged_df <- merged_df[, -1]
row.names(merged_df) <- merged_df$Row.names
merged_df$Row.names <- NULL # remove the "name" column
merged_df$name <- NULL
merged_df <- merged_df[, c("PC1","PC2","PC3","PC4","PC5","PC6","PC7","PC8","group","condition","donor")]
write.csv(merged_df, file="merged_df_8PCs.csv")
summary(pc)
#0.3311 0.2376 0.1247
#0.3637 0.2564 0.1184 --> 0.36, 026, 0,12
draw_3D.py
draw_3D.py
import plotly.graph_objects as go
import pandas as pd
from sklearn.decomposition import PCA
import numpy as np
from scipy.linalg import eigh, sqrtm
# Read in data as a pandas dataframe
#df = pd.DataFrame({
# 'PC1': [-13.999925, -12.504291, -12.443057, -13.065235, -17.316215],
# 'PC2': [-1.498823, -3.342411, -6.067055, -8.205809, 3.293993],
# 'PC3': [-3.335085, 15.207755, -14.725450, 15.078469, -6.917358],
# 'condition': ['GFP d3', 'GFP d3', 'GFP d8', 'GFP d8', 'GFP+mCh d9/12'],
# 'donor': ['DI', 'DII', 'DI', 'DII', 'DI']
#})
df = pd.read_csv('merged_df_8PCs.csv', index_col=0, header=0)
df['condition'] = df['condition'].replace("control", "ctrl d8")
df['condition'] = df['condition'].replace("LTtr", "LTtr d8")
df['condition'] = df['condition'].replace("LT", "LT d8")
df['condition'] = df['condition'].replace("LT_K331A", "LT K331A d8")
# Fit PCA model to reduce data dimensions to 3
pca = PCA(n_components=3)
pca.fit(df.iloc[:, :-3])
X_reduced = pca.transform(df.iloc[:, :-3])
# Add reduced data back to dataframe
df['PC1'] = X_reduced[:, 0]
df['PC2'] = X_reduced[:, 1]
df['PC3'] = X_reduced[:, 2]
# Create PCA plot with 3D scatter
fig = go.Figure()
#['circle', 'circle-open', 'square', 'square-open', 'diamond', 'diamond-open', 'cross', 'x']
# if donor == 'DI' else marker=dict(size=2, opacity=0.8, color=condition_color, symbol=donor_symbol)
#decrease diamond size to 6 while keep the circle as size 10 in the following code:
#'rgb(128, 150, 128)'
#I need three families of colors, always from light to deep, the first one should close to grey.
#the first serie for 'ctrl LTtr+sT d9/12', 'LTtr+sT d9/12'
#the second serie for 'ctrl LT/LTtr d3', 'ctrl LT/LTtr d8', 'LT d3', 'LT d8', 'LTtr d3', 'LTtr d8'
#the third serie for 'ctrl sT d3', 'ctrl sT d8', 'sT d3', 'sT d8', 'sT+LT d3'
condition_color_map_untreated = {'untreated':'black'}
donor_symbol_map_untreated = {'DI': 'circle-open', 'DII': 'diamond-open'}
#condition_color_map = {'ctrl LTtr+sT d9/12': 'green', 'GFP d3': 'blue', 'GFP d8': 'red', 'GFP+mCh d9/12': 'green', 'LT d3': 'orange'}
condition_color_map = {
'ctrl d8': '#A9A9A9',
'LT d8': '#a6cee3',
'LT K331A d8': '#1f78b4',
'LTtr d8': '#e31a1c'
}
donor_symbol_map = {'DI': 'circle', 'DII': 'diamond'}
for donor, donor_symbol in donor_symbol_map_untreated.items():
for condition, condition_color in condition_color_map_untreated.items():
mask = (df['condition'] == condition) & (df['donor'] == donor)
fig.add_trace(go.Scatter3d(x=df.loc[mask, 'PC1'], y=df.loc[mask, 'PC2'], z=df.loc[mask, 'PC3'],
mode='markers',
name=f'{condition}' if donor == 'DI' else None,
legendgroup=f'{condition}',
showlegend=True if donor == 'DI' else False,
marker=dict(size=4 if donor_symbol in ['diamond-open'] else 6, opacity=0.8, color=condition_color, symbol=donor_symbol)))
for donor, donor_symbol in donor_symbol_map.items():
for condition, condition_color in condition_color_map.items():
mask = (df['condition'] == condition) & (df['donor'] == donor)
fig.add_trace(go.Scatter3d(x=df.loc[mask, 'PC1'], y=df.loc[mask, 'PC2'], z=df.loc[mask, 'PC3'],
mode='markers',
name=f'{condition}' if donor == 'DI' else None,
legendgroup=f'{condition}',
showlegend=True if donor == 'DI' else False,
marker=dict(size=4 if donor_symbol in ['diamond'] else 6, opacity=0.8, color=condition_color, symbol=donor_symbol)))
for donor, donor_symbol in donor_symbol_map.items():
fig.add_trace(go.Scatter3d(x=[None], y=[None], z=[None],
mode='markers',
name=donor,
legendgroup=f'{donor}',
showlegend=True,
marker=dict(size=6, opacity=1, color='black', symbol=donor_symbol),
hoverinfo='none'))
# Annotations for the legend blocks
fig.update_layout(
annotations=[
dict(x=1.1, y=1.0, xref='paper', yref='paper', showarrow=False,
text='Condition', font=dict(size=15)),
dict(x=1.1, y=0.6, xref='paper', yref='paper', showarrow=False,
text='Donor', font=dict(size=15)),
dict(x=1.08, y=0.2, xref='paper', yref='paper', showarrow=False,
text='PC3: 12% variance', font=dict(size=15), textangle=-90)
],
scene=dict(
xaxis=dict(gridcolor='lightgrey', linewidth=2, linecolor='black', backgroundcolor='white', zerolinecolor='black', zerolinewidth=2, title='PC1: 36% variance'),
yaxis=dict(gridcolor='lightgrey', linewidth=2, linecolor='black', backgroundcolor='white', zerolinecolor='black', zerolinewidth=2, title='PC2: 26% v.'),
zaxis=dict(gridcolor='lightgrey', linewidth=2, linecolor='black', backgroundcolor='white', zerolinecolor='black', zerolinewidth=2, title=''),
bgcolor='white'
),
margin=dict(l=0, r=0, b=0, t=0) # Adjust the margins to prevent clipping of axis titles
)
#0.3311 0.2376 0.1247
#summary(pc) #--> Proportion of Variance 0.3647 0.1731 0.1515
#percentVar <- round(100 * attr(data, "percentVar"))
#percentVar <- c(36,17,15)
#fig.show()
fig.write_image("PCA_3D.svg")
draw 2D PCA plots.
library(gplots)
library("RColorBrewer")
#mat <- assay(rld)
#mm <- model.matrix(~condition, colData(rld))
#mat <- limma::removeBatchEffect(mat, batch=rld$batch, design=mm)
#assay(rld) <- mat
# -- pca --
png("pca.png", 1200, 800)
plotPCA(rld, intgroup=c("condition"))
dev.off()
# -- heatmap --
png("heatmap.png", 1200, 800)
distsRL <- dist(t(assay(rld)))
mat <- as.matrix(distsRL)
hc <- hclust(distsRL)
hmcol <- colorRampPalette(brewer.pal(9,"GnBu"))(100)
heatmap.2(mat, Rowv=as.dendrogram(hc),symm=TRUE, trace="none",col = rev(hmcol), margin=c(13, 13))
dev.off()
differential expressions
dds$condition <- relevel(dds$condition, "control")
dds = DESeq(dds, betaPrior=FALSE)
resultsNames(dds)
clist <- c("LT_K331A_vs_control", "LT_vs_control", "LTtr_vs_control")
dds$condition <- relevel(dds$condition, "LT")
dds = DESeq(dds, betaPrior=FALSE)
resultsNames(dds)
clist <- c("LT_K331A_vs_LT")
library(dplyr)
library(tidyverse)
library(biomaRt)
listEnsembl()
listMarts()
ensembl <- useEnsembl(biomart = "ensembl", dataset = "hsapiens_gene_ensembl", version="104")
datasets <- listDatasets(ensembl)
attributes = listAttributes(ensembl)
attributes[1:25,]
for (i in clist) {
contrast = paste("condition", i, sep="_")
res = results(dds, name=contrast)
res <- res[!is.na(res$log2FoldChange),]
geness <- getBM(attributes = c('ensembl_gene_id', 'external_gene_name', 'gene_biotype', 'entrezgene_id', 'chromosome_name', 'start_position', 'end_position', 'strand', 'description'),
filters = 'ensembl_gene_id',
values = rownames(res),
mart = ensembl)
geness_uniq <- distinct(geness, ensembl_gene_id, .keep_all= TRUE)
res$ENSEMBL = rownames(res)
identical(rownames(res), geness_uniq$ensembl_gene_id)
res_df <- as.data.frame(res)
geness_res <- merge(geness_uniq, res_df, by.x="ensembl_gene_id", by.y="ENSEMBL")
dim(geness_res)
rownames(geness_res) <- geness_res$ensembl_gene_id
geness_res$ensembl_gene_id <- NULL
write.csv(as.data.frame(geness_res[order(geness_res$pvalue),]), file = paste(i, "all.txt", sep="-"))
up <- subset(geness_res, padj<=0.05 & log2FoldChange>=2)
down <- subset(geness_res, padj<=0.05 & log2FoldChange<=-2)
write.csv(as.data.frame(up[order(up$log2FoldChange,decreasing=TRUE),]), file = paste(i, "up.txt", sep="-"))
write.csv(as.data.frame(down[order(abs(down$log2FoldChange),decreasing=TRUE),]), file = paste(i, "down.txt", sep="-"))
}
volcano plots with automatically finding top_g
#sorted_geness_res <- geness_res %>% arrange(desc(log2FoldChange))
#CSF3, ADORA2A, RPS26P6, EHF, TNFRSF6B
#sorted_geness_res <- geness_res %>% arrange(log2FoldChange)
library(ggrepel)
geness_res <- read.csv(file = "LT_K331A_vs_control-all.txt", sep=",", row.names=1)
# Color setting
geness_res$Color <- ifelse(geness_res$padj > 0.05 | abs(geness_res$log2FoldChange) < 2, "gray",
ifelse(geness_res$log2FoldChange > 0, "red", "blue"))
# Predefined genes colored in green
predefined_genes <- c() #for wt_3+21h.png, K3R_3+21h.png, and *_vs_control.png
geness_res$Color[geness_res$external_gene_name %in% predefined_genes] <- "green"
geness_res$invert_Padj <- (-log10(geness_res$padj)) * sign(geness_res$log2FoldChange)
top_g <- unique(c(geness_res[order(geness_res$invert_Padj, decreasing = TRUE), 'external_gene_name'][1:1000],
geness_res[order(geness_res$invert_Padj, decreasing = FALSE), 'external_gene_name'][1:1000]))
# Define the original and compressed ranges
original_range <- c(18, 30)
compressed_range <- c(18.0, 22.0)
# Adjust the p-values based on the ranges
geness_res$adjusted_pvalue <- with(geness_res,
ifelse(-log10(padj) > original_range[1] & -log10(padj) <= original_range[2],
((-log10(padj) - original_range[1]) / (original_range[2] - original_range[1])) * (compressed_range[2] - compressed_range[1]) + compressed_range[1],
ifelse(-log10(padj) > original_range[2],
-log10(padj) - (original_range[2] - original_range[1]) + (compressed_range[2] - compressed_range[1]),
-log10(padj))))
# Calculate breaks for the y-axis
y_breaks_below <- seq(0, 15, by=5)
y_breaks_compressed <- c(18.0, 22.0)
y_breaks_above <- c(27.0)
y_breaks <- c(y_breaks_below, y_breaks_compressed, y_breaks_above)
y_labels_below <- seq(0, 15, by=5)
y_labels_compressed <- c(18, 30)
y_labels_above <- c(35)
y_labels <- c(y_labels_below, y_labels_compressed, y_labels_above)
# Create the plot
png("LT_K331A_vs_control.png", width=1000, height=1000)
ggplot(geness_res, aes(x = log2FoldChange, y = adjusted_pvalue, color = Color, label = external_gene_name)) +
geom_vline(xintercept = c(2, -2), lty = "dashed", size = 1.5) +
geom_hline(yintercept = -log10(0.05), lty = "dashed", size = 1.5) +
geom_point(size = 3) +
labs(x = "log2(Fold change)", y = "-log10(P-adj)", color = "Significance") +
scale_color_identity() +
geom_text_repel(data = subset(geness_res, external_gene_name %in% top_g & padj < 0.05 & (abs(log2FoldChange) >= 2)),
size = 6,
point.padding = 0.15,
color = "black",
min.segment.length = .1,
box.padding = .2,
lwd = 2) +
theme_bw(base_size = 22) +
theme(legend.position = "bottom") +
annotate("rect", xmin = -Inf, xmax = Inf, ymin = compressed_range[1], ymax = compressed_range[1], linetype = "dashed", color = "grey") +
annotate("rect", xmin = -Inf, xmax = Inf, ymin = compressed_range[2], ymax = compressed_range[2], linetype = "dashed", color = "grey") +
annotate("text", x = -Inf, y = compressed_range[1], label = "/", hjust = 0, size = 10) +
annotate("text", x = -Inf, y = compressed_range[2], label = "/", hjust = 0, size = 10) +
scale_y_continuous(breaks = sort(y_breaks), labels = sort(y_labels))
dev.off()
geness_res <- read.csv(file = "LT_vs_control-all.txt", sep=",", row.names=1)
# Color setting
geness_res$Color <- ifelse(geness_res$padj > 0.05 | abs(geness_res$log2FoldChange) < 2, "gray",
ifelse(geness_res$log2FoldChange > 0, "red", "blue"))
# Predefined genes colored in green
predefined_genes <- c() #for wt_3+21h.png, K3R_3+21h.png, and *_vs_control.png
geness_res$Color[geness_res$external_gene_name %in% predefined_genes] <- "green"
geness_res$invert_Padj <- (-log10(geness_res$padj)) * sign(geness_res$log2FoldChange)
top_g <- unique(c(geness_res[order(geness_res$invert_Padj, decreasing = TRUE), 'external_gene_name'][1:400],
geness_res[order(geness_res$invert_Padj, decreasing = FALSE), 'external_gene_name'][1:400]))
# Define the original and compressed ranges
original_range <- c(18, 38)
compressed_range <- c(18.0, 22.0)
# Adjust the p-values based on the ranges
geness_res$adjusted_pvalue <- with(geness_res,
ifelse(-log10(padj) > original_range[1] & -log10(padj) <= original_range[2],
((-log10(padj) - original_range[1]) / (original_range[2] - original_range[1])) * (compressed_range[2] - compressed_range[1]) + compressed_range[1],
ifelse(-log10(padj) > original_range[2],
-log10(padj) - (original_range[2] - original_range[1]) + (compressed_range[2] - compressed_range[1]),
-log10(padj))))
# Calculate breaks for the y-axis
y_breaks_below <- seq(0, 15, by=5)
y_breaks_compressed <- c(18.0, 22.0)
y_breaks_above <- c(27.0)
y_breaks <- c(y_breaks_below, y_breaks_compressed, y_breaks_above)
y_labels_below <- seq(0, 15, by=5)
y_labels_compressed <- c(18, 38)
y_labels_above <- c(43)
y_labels <- c(y_labels_below, y_labels_compressed, y_labels_above)
# Create the plot
png("LT_vs_control.png", width=1000, height=1000)
ggplot(geness_res, aes(x = log2FoldChange, y = adjusted_pvalue, color = Color, label = external_gene_name)) +
geom_vline(xintercept = c(2, -2), lty = "dashed", size = 1.5) +
geom_hline(yintercept = -log10(0.05), lty = "dashed", size = 1.5) +
geom_point(size = 3) +
labs(x = "log2(Fold change)", y = "-log10(P-adj)", color = "Significance") +
scale_color_identity() +
geom_text_repel(data = subset(geness_res, external_gene_name %in% top_g & padj < 0.05 & (abs(log2FoldChange) >= 2)),
size = 6,
point.padding = 0.15,
color = "black",
min.segment.length = .1,
box.padding = .2,
lwd = 2) +
theme_bw(base_size = 22) +
theme(legend.position = "bottom") +
annotate("rect", xmin = -Inf, xmax = Inf, ymin = compressed_range[1], ymax = compressed_range[1], linetype = "dashed", color = "grey") +
annotate("rect", xmin = -Inf, xmax = Inf, ymin = compressed_range[2], ymax = compressed_range[2], linetype = "dashed", color = "grey") +
annotate("text", x = -Inf, y = compressed_range[1], label = "/", hjust = 0, size = 10) +
annotate("text", x = -Inf, y = compressed_range[2], label = "/", hjust = 0, size = 10) +
scale_y_continuous(breaks = sort(y_breaks), labels = sort(y_labels))
dev.off()
geness_res <- read.csv(file = "LTtr_vs_control-all.txt", sep=",", row.names=1)
# Color setting
geness_res$Color <- ifelse(geness_res$padj > 0.05 | abs(geness_res$log2FoldChange) < 2, "gray",
ifelse(geness_res$log2FoldChange > 0, "red", "blue"))
# Predefined genes colored in green
predefined_genes <- c() #for wt_3+21h.png, K3R_3+21h.png, and *_vs_control.png
geness_res$Color[geness_res$external_gene_name %in% predefined_genes] <- "green"
geness_res$invert_Padj <- (-log10(geness_res$padj)) * sign(geness_res$log2FoldChange)
top_g <- unique(c(geness_res[order(geness_res$invert_Padj, decreasing = TRUE), 'external_gene_name'][1:400],
geness_res[order(geness_res$invert_Padj, decreasing = FALSE), 'external_gene_name'][1:400]))
# Define the original and compressed ranges
original_range <- c(15, 25)
compressed_range <- c(15.0, 18.0)
# Adjust the p-values based on the ranges
geness_res$adjusted_pvalue <- with(geness_res,
ifelse(-log10(padj) > original_range[1] & -log10(padj) <= original_range[2],
((-log10(padj) - original_range[1]) / (original_range[2] - original_range[1])) * (compressed_range[2] - compressed_range[1]) + compressed_range[1],
ifelse(-log10(padj) > original_range[2],
-log10(padj) - (original_range[2] - original_range[1]) + (compressed_range[2] - compressed_range[1]),
-log10(padj))))
# Calculate breaks for the y-axis
y_breaks_below <- seq(0, 10, by=5)
y_breaks_compressed <- c(15.0, 18.0)
y_breaks_above <- c(23.0)
y_breaks <- c(y_breaks_below, y_breaks_compressed, y_breaks_above)
y_labels_below <- seq(0, 10, by=5)
y_labels_compressed <- c(15, 25)
y_labels_above <- c(30)
y_labels <- c(y_labels_below, y_labels_compressed, y_labels_above)
# Create the plot
png("LTtr_vs_control.png", width=1000, height=1000)
ggplot(geness_res, aes(x = log2FoldChange, y = adjusted_pvalue, color = Color, label = external_gene_name)) +
geom_vline(xintercept = c(2, -2), lty = "dashed", size = 1.5) +
geom_hline(yintercept = -log10(0.05), lty = "dashed", size = 1.5) +
geom_point(size = 3) +
labs(x = "log2(Fold change)", y = "-log10(P-adj)", color = "Significance") +
scale_color_identity() +
geom_text_repel(data = subset(geness_res, external_gene_name %in% top_g & padj < 0.05 & (abs(log2FoldChange) >= 2)),
size = 6,
point.padding = 0.15,
color = "black",
min.segment.length = .1,
box.padding = .2,
lwd = 2) +
theme_bw(base_size = 22) +
theme(legend.position = "bottom") +
annotate("rect", xmin = -Inf, xmax = Inf, ymin = compressed_range[1], ymax = compressed_range[1], linetype = "dashed", color = "grey") +
annotate("rect", xmin = -Inf, xmax = Inf, ymin = compressed_range[2], ymax = compressed_range[2], linetype = "dashed", color = "grey") +
annotate("text", x = -Inf, y = compressed_range[1], label = "/", hjust = 0, size = 10) +
annotate("text", x = -Inf, y = compressed_range[2], label = "/", hjust = 0, size = 10) +
scale_y_continuous(breaks = sort(y_breaks), labels = sort(y_labels))
dev.off()
geness_res <- read.csv(file = "LT_K331A_vs_LT-all.txt", sep=",", row.names=1)
# Color setting
geness_res$Color <- ifelse(geness_res$padj > 0.05 | abs(geness_res$log2FoldChange) < 2, "gray",
ifelse(geness_res$log2FoldChange > 0, "red", "blue"))
# Predefined genes colored in green
predefined_genes <- c() #for wt_3+21h.png, K3R_3+21h.png, and *_vs_control.png
geness_res$Color[geness_res$external_gene_name %in% predefined_genes] <- "green"
geness_res$invert_Padj <- (-log10(geness_res$padj)) * sign(geness_res$log2FoldChange)
top_g <- unique(c(geness_res[order(geness_res$invert_Padj, decreasing = TRUE), 'external_gene_name'][1:400],
geness_res[order(geness_res$invert_Padj, decreasing = FALSE), 'external_gene_name'][1:400]))
# Define the original and compressed ranges
original_range <- c(10, 11)
compressed_range <- c(10.0, 11.0)
# Adjust the p-values based on the ranges
geness_res$adjusted_pvalue <- with(geness_res,
ifelse(-log10(padj) > original_range[1] & -log10(padj) <= original_range[2],
((-log10(padj) - original_range[1]) / (original_range[2] - original_range[1])) * (compressed_range[2] - compressed_range[1]) + compressed_range[1],
ifelse(-log10(padj) > original_range[2],
-log10(padj) - (original_range[2] - original_range[1]) + (compressed_range[2] - compressed_range[1]),
-log10(padj))))
# Create the plot
png("LT_K331A_vs_LT.png", width=1000, height=1000)
ggplot(geness_res, aes(x = log2FoldChange, y = adjusted_pvalue, color = Color, label = external_gene_name)) +
geom_vline(xintercept = c(2, -2), lty = "dashed", size = 1.5) +
geom_hline(yintercept = -log10(0.05), lty = "dashed", size = 1.5) +
geom_point(size = 3) +
labs(x = "log2(Fold change)", y = "-log10(P-adj)", color = "Significance") +
scale_color_identity() +
geom_text_repel(data = subset(geness_res, external_gene_name %in% top_g & padj < 0.05 & (abs(log2FoldChange) >= 2)),
size = 6,
point.padding = 0.15,
color = "black",
min.segment.length = .1,
box.padding = .2,
lwd = 2) +
theme_bw(base_size = 22) +
theme(legend.position = "bottom")
dev.off()
for i in LT_K331A_vs_control LT_vs_control LTtr_vs_control LT_K331A_vs_LT; do
echo "~/Tools/csv2xls-0.4/csv_to_xls.py ${i}-all.txt ${i}-up.txt ${i}-down.txt -d$',' -o ${i}.xls;"
done
clustering the genes and draw heatmap
install.packages("gplots")
library("gplots")
for i in LT_K331A_vs_control LT_vs_control LTtr_vs_control LT_K331A_vs_LT; do
echo "cut -d',' -f1-1 ${i}-up.txt > ${i}-up.id"
echo "cut -d',' -f1-1 ${i}-down.txt > ${i}-down.id"
done
# 307 LT_K331A_vs_control-down.id
# 667 LT_K331A_vs_control-up.id
# 66 LT_K331A_vs_LT-down.id
# 71 LT_K331A_vs_LT-up.id
# 157 LTtr_vs_control-down.id
# 484 LTtr_vs_control-up.id
# 379 LT_vs_control-down.id
# 749 LT_vs_control-up.id
# 2880 total
cat *.id | sort -u > ids
#add Gene_Id in the first line, delete the ""
GOI <- read.csv("ids")$Gene_Id #570 genes
RNASeq.NoCellLine <- assay(rld)
# Defining the custom order
column_order <- c(
"control_DI", "control_DII", "LTtr_DI", "LTtr_DII", "LT_DI", "LT_DII", "LT_K331A_DI", "LT_K331A_DII"
)
RNASeq.NoCellLine_reordered <- RNASeq.NoCellLine[, column_order]
head(RNASeq.NoCellLine_reordered)
#clustering methods: "ward.D", "ward.D2", "single", "complete", "average" (= UPGMA), "mcquitty" (= WPGMA), "median" (= WPGMC) or "centroid" (= UPGMC). pearson or spearman
datamat = RNASeq.NoCellLine_reordered[GOI, ]
write.csv(as.data.frame(datamat), file ="DEGs_heatmap_data.csv")
hr <- hclust(as.dist(1-cor(t(datamat), method="pearson")), method="complete")
hc <- hclust(as.dist(1-cor(datamat, method="spearman")), method="complete")
mycl = cutree(hr, h=max(hr$height)/1.05)
mycol = c("YELLOW", "DARKBLUE", "DARKORANGE", "DARKMAGENTA", "DARKCYAN", "DARKRED", "MAROON", "DARKGREEN", "LIGHTBLUE", "PINK", "MAGENTA", "LIGHTCYAN","LIGHTGREEN", "BLUE", "ORANGE", "CYAN", "RED", "GREEN");
mycol = mycol[as.vector(mycl)]
png("DEGs_heatmap.png", width=900, height=1010)
heatmap.2(as.matrix(datamat),Rowv=as.dendrogram(hr),Colv = NA, dendrogram = 'row',
scale='row',trace='none',col=bluered(75),
RowSideColors = mycol, labRow="", srtCol=20, keysize=0.72, cexRow = 2, cexCol = 1.4)
dev.off()
# Extract rows from datamat where the row names match the identifiers in subset_1
#### cluster members #####
subset_1<-names(subset(mycl, mycl == '1'))
data <- as.data.frame(datamat[rownames(datamat) %in% subset_1, ]) #579
subset_2<-names(subset(mycl, mycl == '2'))
data <- as.data.frame(datamat[rownames(datamat) %in% subset_2, ]) #399
subset_3<-names(subset(mycl, mycl == '3'))
data <- as.data.frame(datamat[rownames(datamat) %in% subset_3, ]) #70
subset_4<-names(subset(mycl, mycl == '4'))
data <- as.data.frame(datamat[rownames(datamat) %in% subset_4, ]) #433
subset_5<-names(subset(mycl, mycl == '5'))
data <- as.data.frame(datamat[rownames(datamat) %in% subset_5, ]) #140
# Initialize an empty data frame for the annotated data
annotated_data <- data.frame()
# Determine total number of genes
total_genes <- length(rownames(data))
# Loop through each gene to annotate
for (i in 1:total_genes) {
gene <- rownames(data)[i]
result <- getBM(attributes = c('ensembl_gene_id', 'external_gene_name', 'gene_biotype', 'entrezgene_id', 'chromosome_name', 'start_position', 'end_position', 'strand', 'description'),
filters = 'ensembl_gene_id',
values = gene,
mart = ensembl)
# If multiple rows are returned, take the first one
if (nrow(result) > 1) {
result <- result[1, ]
}
# Check if the result is empty
if (nrow(result) == 0) {
result <- data.frame(ensembl_gene_id = gene,
external_gene_name = NA,
gene_biotype = NA,
entrezgene_id = NA,
chromosome_name = NA,
start_position = NA,
end_position = NA,
strand = NA,
description = NA)
}
# Transpose expression values
expression_values <- t(data.frame(t(data[gene, ])))
colnames(expression_values) <- colnames(data)
# Combine gene information and expression data
combined_result <- cbind(result, expression_values)
# Append to the final dataframe
annotated_data <- rbind(annotated_data, combined_result)
# Print progress every 100 genes
if (i %% 100 == 0) {
cat(sprintf("Processed gene %d out of %d\n", i, total_genes))
}
}
# Save the annotated data to a new CSV file
write.csv(annotated_data, "cluster1_YELLOW.csv", row.names=FALSE)
write.csv(annotated_data, "cluster2_DARKBLUE.csv", row.names=FALSE)
write.csv(annotated_data, "cluster3_DARKORANGE.csv", row.names=FALSE)
write.csv(annotated_data, "cluster4_DARKMAGENTA.csv", row.names=FALSE)
write.csv(annotated_data, "cluster5_DARKCYAN.csv", row.names=FALSE)
#~/Tools/csv2xls-0.4/csv_to_xls.py cluster*.csv -d',' -o DEGs_heatmap_clusters.xls
点赞本文的读者
还没有人对此文章表态
没有评论
RNA-seq skin organoids on GRCh38+chrHsv1 (final)
Small RNA sequencing processing in the example of smallRNA_7
RNAseq Analysis Overview of Osteoblasts Infected with S. epidermidis Strains
© 2023 XGenes.com Impressum