RNAseq processing for organoids

gene_x 0 like s 840 view s

Tags: pipeline, RNA-seq

install conda environment

#conda config --set auto_activate_base false

conda create --name rnaseq python=3.7

#NOTE: mamba 确实快多了,以后都用 mamba❕
#install packages
conda activate rnaseq
pip3 install deeptools
pip3 install multiqc
conda install -c bioconda stringtie subread gffread
conda install -c conda-forge -c bioconda -c defaults -c r r-data.table r-gplots
conda install -c conda-forge -c bioconda -c defaults -c r bioconductor-dupradar bioconductor-edger
conda install nextflow=23.04

conda install fq
conda install -c bioconda umi_tools
conda install -c bioconda rsem
conda install -c bioconda salmon

#conda install some tools
#install R-packages, 
conda install -c bioconda ucsc-bedclip
conda install -c bioconda ucsc-bedgraphtobigwig
conda install -c bioconda bioconductor-matrixgenerics
#conda install -c bioconda bioconductor-deseq2
conda install -c bioconda r-pheatmap
conda install -c anaconda gawk

conda install mamba -n base -c conda-forge
conda config --add channels conda-forge
mamba install -c bioconda salmon=1.10
#salmon should be >= 1.10 since in those version salmon set `--validateMappings` as default.

conda install -c bioconda trim-galore star=2.6.1d bioconductor-summarizedexperiment bioconductor-tximport bioconductor-tximeta bioconductor-deseq2
mamba install -c bioconda samtools=1.9  
mamba install -c conda-forge r-optparse r-vctrs=0.5.0
conda install nextflow=23.04
mamba install -c bioconda qualimap
mamba install -c bioconda rseqc
mamba install -c conda-forge openssl
conda install -c bioconda ucsc-bedclip
conda install -c bioconda bedtools
conda update -c bioconda ucsc-bedclip
#for DEBUG: bedClip: error while loading shared libraries: libssl.so.1.0.0: cannot open shared object file: No such file or directory
conda update -c bioconda ucsc-bedgraphtobigwig
# samtools should be >= 1.9 as only those have the option @
#samtools sort \
#      -@ 6 \
#      -o HSV.d2_r1.sorted.bam \
#      -T HSV.d2_r1.sorted \
#      HSV.d2_r1.Aligned.out.bam

run UMItools without --umitools_dedup_stats, otherwise it cannot be finished in hamm.

Optimize UMItools parameters: Some parameters might influence the memory usage of UMItools. For example, you can try to reduce the number of allowed mismatches in the UMI sequence (--edit-distance-threshold). This will make the deduplication process less memory intensive but might also impact the results.

Use other deduplication tools: If the problem persists, you might need to use alternative tools for UMI deduplication which are less memory-intensive. Tools such as fgbio have a grouping and deduplication method similar to UMItools but have been reported to require less memory.

#https://github.com/nf-core/rnaseq/issues/827 #INFO for DEBUG: https://umi-tools.readthedocs.io/en/latest/faq.html #INFO for DEBUG: https://readthedocs.org/projects/umi-tools/downloads/pdf/stable/ #https://github.com/CGATOxford/UMI-tools/issues/173 # excessive dedup memory usage with output-stats #409 #https://github.com/CGATOxford/UMI-tools/issues/409 #umi_tools 1.0.1 #I am aware of previously closed issues: #excessive dedup memory usage #173 #speed up stats #184 #Running a single-end bam file with 3.13M reads and a 10bp (fully random) UMI. #Using --method=unique #There still seems to be a memory problem with --output-stats #Running with output-stats, memory usage climbs over 100GB and eventually crashes with "MemoryError". #Running without output-stats, job completes in about 3 minutes, with no problems.

  #TRY STANDALONE RUNNING: /usr/local/bin/python /usr/local/bin/umi_tools dedup -I HSV.d8_r1.transcriptome.sorted.bam -S HSV.d8_r1.umi_dedup.transcriptome.sorted.bam --method=unique --random-seed=100 
  #/home/jhuang/miniconda3/envs/rnaseq/bin/python /home/jhuang/miniconda3/envs/rnaseq/bin/umi_tools dedup -I star_salmon/HSV.d8_r1.sorted.bam -S HSV.d8_r1.umi_dedup.sorted.bam --output-stats HSV.d8_r1.umi_dedup.sorted --method=unique --random-seed=100

#umitools dedup uses large amounts of memory and runs slowly. To speed it up it is recommended to only run it on a single chromosome, see the FAQ point number 4. #I suggest either making the --output-stats optional, or running a second round of deduplication on a single chromosome to generate the output stats.

  #--Human--
  #hamm
  /usr/local/bin/nextflow run rnaseq/main.nf --input samplesheet.csv --outdir results_GRCh38 --genome GRCh38   --with_umi --umitools_extract_method "regex" --umitools_bc_pattern "^(?P<umi_1>.{12}).*"         -profile docker     -resume  --max_cpus 54 --max_memory 120.GB --max_time 2400.h        --aligner 'star_salmon' --pseudo_aligner 'salmon'    --umitools_grouping_method 'unique'
  #--save_align_intermeds --save_unaligned --save_reference

  #sage
  nextflow run rnaseq/main.nf                --input samplesheet.csv --outdir results_GRCh38 --genome GRCh38   --with_umi --umitools_extract_method "regex" --umitools_bc_pattern "^(?P<umi_1>.{12}).*"         -profile test_full  -resume                --max_memory 256.GB --max_time 2400.h        --aligner 'star_salmon' --pseudo_aligner 'salmon'
  #--save_align_intermeds --save_unaligned --save_reference

  #--Virus--
  /usr/local/bin/nextflow run rnaseq/main.nf --input samplesheet.csv --outdir results_virus    --fasta "/home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/X14112.1.fasta" --gtf "/home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/X14112.1_v4.gtf"   --with_umi --umitools_extract_method "regex" --umitools_bc_pattern "^(?P<umi_1>.{12}).*" --umitools_dedup_stats    --skip_rseqc --skip_dupradar --skip_preseq    -profile test_full -resume  --max_cpus 55 --max_memory 120.GB --max_time 2400.h    --save_align_intermeds --save_unaligned --save_reference    --aligner 'hisat2'    --gtf_extra_attributes 'gene_name' --gtf_group_features 'gene_id' --featurecounts_group_type 'gene_name' --featurecounts_feature_type 'exon'    --umitools_grouping_method 'unique'

R-code for evaluation of nextflow outputs

# Import the required libraries
library("AnnotationDbi")
library("clusterProfiler")
library("ReactomePA")
library(gplots)

library(tximport)
library(DESeq2)

setwd("~/DATA/Data_Manja_RNAseq_Organoids/results_GRCh38_unique/star_salmon")

# Define paths to your Salmon output quantification files
files <- c("control_r1" = "./control_r1/quant.sf",
          "control_r2" = "./control_r2/quant.sf",
          "HSV.d2_r1" = "./HSV.d2_r1/quant.sf",
          "HSV.d2_r2" = "./HSV.d2_r2/quant.sf",
          "HSV.d4_r1" = "./HSV.d4_r1/quant.sf",
          "HSV.d4_r2" = "./HSV.d4_r2/quant.sf",
          "HSV.d6_r1" = "./HSV.d6_r1/quant.sf",
          "HSV.d6_r2" = "./HSV.d6_r2/quant.sf",
          "HSV.d8_r1" = "./HSV.d8_r1/quant.sf",
          "HSV.d8_r2" = "./HSV.d8_r2/quant.sf")

# Import the transcript abundance data with tximport
txi <- tximport(files, type = "salmon", txIn = TRUE, txOut = TRUE)

# Define the replicates and condition of the samples
replicate <- factor(c("r1", "r2", "r1", "r2", "r1", "r2", "r1", "r2", "r1", "r2"))
condition <- factor(c("control", "control", "HSV.d2", "HSV.d2", "HSV.d4", "HSV.d4", "HSV.d6", "HSV.d6", "HSV.d8", "HSV.d8"))

# Define the colData for DESeq2
colData <- data.frame(condition=condition, replicate=replicate, row.names=names(files))

# Create DESeqDataSet object
dds <- DESeqDataSetFromTximport(txi, colData=colData, design=~condition)

# In the context of your new code which is using tximport and DESeq2, you don't necessarily need this step. The reason is that DESeq2 performs its own filtering of low-count genes during the normalization and differential expression steps.
# Filter data to retain only genes with more than 2 counts > 3 across all samples
# dds <- dds[rowSums(counts(dds) > 3) > 2, ]

# Run DESeq2
dds <- DESeq(dds)

# Perform rlog transformation
rld <- rlogTransformation(dds)

# Output raw count data to a CSV file
write.csv(counts(dds), file="transcript_counts.csv")

# -- gene-level count data --
# Read in the tx2gene map from salmon_tx2gene.tsv
#tx2gene <- read.csv("salmon_tx2gene.tsv", sep="\t", header=FALSE)
tx2gene <- read.table("salmon_tx2gene.tsv", header=FALSE, stringsAsFactors=FALSE)

# Set the column names
colnames(tx2gene) <- c("transcript_id", "gene_id", "gene_name")

# Remove the gene_name column if not needed
tx2gene <- tx2gene[,1:2]

# Import and summarize the Salmon data with tximport
txi <- tximport(files, type = "salmon", tx2gene = tx2gene, txOut = FALSE)

# Continue with the DESeq2 workflow as before...
colData <- data.frame(condition=condition, replicate=replicate, row.names=names(files))
dds <- DESeqDataSetFromTximport(txi, colData=colData, design=~condition)
#dds <- dds[rowSums(counts(dds) > 3) > 2, ]    #60605-->26543
dds <- DESeq(dds)
rld <- rlogTransformation(dds)
write.csv(counts(dds, normalized=FALSE), file="gene_counts.csv")

#TODO: why a lot of reads were removed due to the too_short?
#STAR --runThreadN 4 --genomeDir /path/to/GenomeDir --readFilesIn /path/to/read1.fastq /path/to/read2.fastq --outFilterMatchNmin 50 --outSAMtype BAM SortedByCoordinate --outFileNamePrefix /path/to/output

dim(counts(dds))
head(counts(dds), 10)

draw 3D PCA plots.

library(gplots) 
library("RColorBrewer")

library(ggplot2)
data <- plotPCA(rld, intgroup=c("condition", "replicate"), returnData=TRUE)
write.csv(data, file="plotPCA_data.csv")
#calculate all PCs including PC3 with the following codes
library(genefilter)
ntop <- 500
rv <- rowVars(assay(rld))
select <- order(rv, decreasing = TRUE)[seq_len(min(ntop, length(rv)))]
mat <- t( assay(rld)[select, ] )
pc <- prcomp(mat)
pc$x[,1:3]
#df_pc <- data.frame(pc$x[,1:3])
df_pc <- data.frame(pc$x)
identical(rownames(data), rownames(df_pc)) #-->TRUE

data$PC1 <- NULL
data$PC2 <- NULL
merged_df <- merge(data, df_pc, by = "row.names")
#merged_df <- merged_df[, -1]
row.names(merged_df) <- merged_df$Row.names
merged_df$Row.names <- NULL  # remove the "name" column
merged_df$name <- NULL
merged_df <- merged_df[, c("PC1","PC2","PC3","PC4","PC5","PC6","PC7","PC8","PC9","PC10","group","condition","replicate")]
write.csv(merged_df, file="merged_df_10PCs.csv")
summary(pc)  
#0.5333  0.2125 0.06852

draw_3D.py

# -- before pca --
png("pca_before_removeBatch2.png", 1200, 800)
plotPCA(rld, intgroup=c("condition"))
dev.off()

# -- before heatmap --
png("heatmap_before_removeBatch2.png", 1200, 800)
distsRL <- dist(t(assay(rld)))
mat <- as.matrix(distsRL)
hc <- hclust(distsRL)
hmcol <- colorRampPalette(brewer.pal(9,"GnBu"))(100)
heatmap.2(mat, Rowv=as.dendrogram(hc),symm=TRUE, trace="none",col = rev(hmcol), margin=c(13, 13))
dev.off()

mat <- assay(rld)
mm <- model.matrix(~replicates, colData(rld))
mat <- limma::removeBatchEffect(mat, batch=rld$batch, design=mm)
assay(rld) <- mat

# -- after pca --
png("pca_after_removeBatch.png", 1200, 800)
#svg("pca_after_removeBatch.svg")
plotPCA(rld, intgroup=c("replicates"))
dev.off()

# -- after heatmap --
png("heatmap_after_removeBatch.png", 1200, 800)
#svg("heatmap_after_removeBatch.svg")
distsRL <- dist(t(assay(rld)))
mat <- as.matrix(distsRL)
hc <- hclust(distsRL)
hmcol <- colorRampPalette(brewer.pal(9,"GnBu"))(100)
heatmap.2(mat, Rowv=as.dendrogram(hc),symm=TRUE, trace="none",col = rev(hmcol), margin=c(13, 13))
dev.off()

(optional) estimate size factors

> head(dds)
class: DESeqDataSet 
dim: 6 10 
metadata(1): version
assays(6): counts avgTxLength ... H cooks
rownames(6): ENSG00000000003 ENSG00000000005 ... ENSG00000000460
  ENSG00000000938
rowData names(34): baseMean baseVar ... deviance maxCooks
colnames(10): control_r1 control_r2 ... HSV.d8_r1 HSV.d8_r2
colData names(2): condition replicate

#convert bam to bigwig using deepTools by feeding inverse of DESeq’s size Factor
sizeFactors(dds)
#NULL
dds <- estimateSizeFactors(dds)
> sizeFactors(dds)

normalized_counts <- counts(dds, normalized=TRUE)
#write.table(normalized_counts, file="normalized_counts.txt", sep="\t", quote=F, col.names=NA)

# ---- DEBUG sizeFactors(dds) always NULL, see https://support.bioconductor.org/p/97676/ ----
nm <- assays(dds)[["avgTxLength"]]
sf <- estimateSizeFactorsForMatrix(counts(dds), normMatrix=nm)

assays(dds)$counts  # for count data
assays(dds)$avgTxLength  # for average transcript length, etc.
assays(dds)$normalizationFactors

In normal circumstances, the size factors should be stored in the DESeqDataSet object itself and not in the assays, so they are typically not retrievable via the assays() function. However, due to the issues you're experiencing, you might be able to manually compute the size factors and assign them back to the DESeqDataSet.

To calculate size factors manually, DESeq2 uses the median ratio method. Here's a very simplified version of how you could compute this manually:
> assays(dds)
List of length 6
names(6): counts avgTxLength normalizationFactors mu H cooks

To calculate size factors manually, DESeq2 uses the median ratio method. Here's a very simplified version of how you could compute this manually:

geoMeans <- apply(assays(dds)$counts, 1, function(row) if (all(row == 0)) 0 else exp(mean(log(row[row != 0]))))
sizeFactors(dds) <- median(assays(dds)$counts / geoMeans, na.rm = TRUE)

# ---- DEBUG END ----

#unter konsole
#  control_r1  ...
# 1/0.9978755  ...

> sizeFactors(dds)
                    HeLa_TO_r1                      HeLa_TO_r2 
                      0.9978755                       1.1092227

1/0.9978755=1.002129023
1/1.1092227=

#bamCoverage --bam ../markDuplicates/${sample}Aligned.sortedByCoord.out.bam -o ${sample}_norm.bw --binSize 10 --scaleFactor  --effectiveGenomeSize 2864785220
bamCoverage --bam ../markDuplicates/HeLa_TO_r1Aligned.sortedByCoord.out.markDups.bam -o HeLa_TO_r1.bw --binSize 10 --scaleFactor 1.002129023     --effectiveGenomeSize 2864785220
bamCoverage --bam ../markDuplicates/HeLa_TO_r2Aligned.sortedByCoord.out.markDups.bam -o HeLa_TO_r2.bw --binSize 10 --scaleFactor  0.901532217        --effectiveGenomeSize 2864785220

differential expressions

#A central method for exploring differences between groups of segments or samples is to perform differential gene expression analysis.

dds$condition <- relevel(dds$condition, "control")
dds = DESeq(dds, betaPrior=FALSE)
resultsNames(dds)
clist <- c("HSV.d2_vs_control","HSV.d4_vs_control","HSV.d6_vs_control","HSV.d8_vs_control")

dds$condition <- relevel(dds$condition, "HSV.d2")
dds = DESeq(dds, betaPrior=FALSE)
resultsNames(dds)
clist <- c("HSV.d4_vs_HSV.d2","HSV.d6_vs_HSV.d2","HSV.d8_vs_HSV.d2")

dds$condition <- relevel(dds$condition, "HSV.d4")
dds = DESeq(dds, betaPrior=FALSE)
resultsNames(dds)
clist <- c("HSV.d6_vs_HSV.d4","HSV.d8_vs_HSV.d4")

dds$condition <- relevel(dds$condition, "HSV.d6")
dds = DESeq(dds, betaPrior=FALSE)
resultsNames(dds)
clist <- c("HSV.d8_vs_HSV.d6")

##https://bioconductor.statistik.tu-dortmund.de/packages/3.7/data/annotation/
#BiocManager::install("EnsDb.Mmusculus.v79")
#library(EnsDb.Mmusculus.v79)
#edb <- EnsDb.Mmusculus.v79

#https://bioconductor.org/packages/release/bioc/vignettes/biomaRt/inst/doc/accessing_ensembl.html#selecting-an-ensembl-biomart-database-and-dataset
#https://bioconductor.org/packages/release/bioc/vignettes/biomaRt/inst/doc/accessing_ensembl.html#selecting-an-ensembl-biomart-database-and-dataset
library(biomaRt)
listEnsembl()
listMarts()
#ensembl <- useEnsembl(biomart = "genes", mirror="asia")  # default is Mouse strains 104
#ensembl <- useEnsembl(biomart = "ensembl", dataset = "mmusculus_gene_ensembl", mirror = "www")
#ensembl = useMart("ensembl_mart_44", dataset="hsapiens_gene_ensembl",archive=TRUE, mysql=TRUE)
#ensembl <- useEnsembl(biomart = "ensembl", dataset = "mmusculus_gene_ensembl", version="104")
#ensembl <- useEnsembl(biomart = "ensembl", dataset = "hsapiens_gene_ensembl", version="86")
#--> total 69, 27  GRCh38.p7 and 39  GRCm38.p4; we should take 104, since rnaseq-pipeline is also using annotation of 104!
ensembl <- useEnsembl(biomart = "ensembl", dataset = "hsapiens_gene_ensembl", version="104")
datasets <- listDatasets(ensembl)
#--> total 202   80                         GRCh38.p13         107                            GRCm39
#80           hsapiens_gene_ensembl                                      Human genes (GRCh38.p13)                         GRCh38.p13
#107         mmusculus_gene_ensembl                                        Mouse genes (GRCm39)                            GRCm39

> listEnsemblArchives()
            name     date                                 url version
1  Ensembl GRCh37 Feb 2014          https://grch37.ensembl.org  GRCh37
2     Ensembl 109 Feb 2023 https://feb2023.archive.ensembl.org     109
3     Ensembl 108 Oct 2022 https://oct2022.archive.ensembl.org     108
4     Ensembl 107 Jul 2022 https://jul2022.archive.ensembl.org     107
5     Ensembl 106 Apr 2022 https://apr2022.archive.ensembl.org     106
6     Ensembl 105 Dec 2021 https://dec2021.archive.ensembl.org     105
7     Ensembl 104 May 2021 https://may2021.archive.ensembl.org     104

attributes = listAttributes(ensembl)
attributes[1:25,]

#https://www.ncbi.nlm.nih.gov/grc/human
#BiocManager::install("org.Mmu.eg.db")
#library("org.Mmu.eg.db")
#edb <- org.Mmu.eg.db
#
#https://bioconductor.statistik.tu-dortmund.de/packages/3.6/data/annotation/
#EnsDb.Mmusculus.v79
#> query(hub, c("EnsDb", "apiens", "98"))
#columns(edb)

#searchAttributes(mart = ensembl, pattern = "symbol")

##https://www.geeksforgeeks.org/remove-duplicate-rows-based-on-multiple-columns-using-dplyr-in-r/
library(dplyr)
library(tidyverse)
#df <- data.frame (lang =c ('Java','C','Python','GO','RUST','Javascript',
                      'Cpp','Java','Julia','Typescript','Python','GO'),
                      value = c (21,21,3,5,180,9,12,20,6,0,3,6),
                      usage =c(21,21,0,99,44,48,53,16,6,8,0,6))
#distinct(df, lang, .keep_all= TRUE)

for (i in clist) {
#"HSV.d2_vs_control","HSV.d4_vs_control","HSV.d6_vs_control","HSV.d8_vs_control"
#i<-clist[1]
  contrast = paste("condition", i, sep="_")
  res = results(dds, name=contrast)
  res <- res[!is.na(res$log2FoldChange),]
  #geness <- AnnotationDbi::select(edb86, keys = rownames(res), keytype = "GENEID", columns = c("ENTREZID","EXONID","GENEBIOTYPE","GENEID","GENENAME","PROTEINDOMAINSOURCE","PROTEINID","SEQNAME","SEQSTRAND","SYMBOL","TXBIOTYPE","TXID","TXNAME","UNIPROTID"))
  #geness <- AnnotationDbi::select(edb86, keys = rownames(res), keytype = "GENEID", columns = c("GENEID", "ENTREZID", "SYMBOL", "GENENAME","GENEBIOTYPE","TXBIOTYPE","SEQSTRAND","UNIPROTID"))
  # In the ENSEMBL-database, GENEID is ENSEMBL-ID.
  #geness <- AnnotationDbi::select(edb86, keys = rownames(res), keytype = "GENEID", columns = c("GENEID", "SYMBOL", "GENEBIOTYPE"))  #  "ENTREZID", "TXID","TXBIOTYPE","TXSEQSTART","TXSEQEND"
  #geness <- geness[!duplicated(geness$GENEID), ]

  #using getBM replacing AnnotationDbi::select
  #filters = 'ensembl_gene_id' means the records should always have a valid ensembl_gene_ids.
  geness <- getBM(attributes = c('ensembl_gene_id', 'external_gene_name', 'gene_biotype', 'entrezgene_id', 'chromosome_name', 'start_position', 'end_position', 'strand', 'description'),
      filters = 'ensembl_gene_id',
      values = rownames(res), 
      mart = ensembl)
  geness_uniq <- distinct(geness, ensembl_gene_id, .keep_all= TRUE)

  #merge by column by common colunmn name, in the case "GENEID"
  res$ENSEMBL = rownames(res)
  identical(rownames(res), geness_uniq$ensembl_gene_id)
  res_df <- as.data.frame(res)
  geness_res <- merge(geness_uniq, res_df, by.x="ensembl_gene_id", by.y="ENSEMBL")
  dim(geness_res)
  rownames(geness_res) <- geness_res$ensembl_gene_id
  geness_res$ensembl_gene_id <- NULL
  write.csv(as.data.frame(geness_res[order(geness_res$pvalue),]), file = paste(i, "all.txt", sep="-"))
  up <- subset(geness_res, padj<=0.05 & log2FoldChange>=2)
  down <- subset(geness_res, padj<=0.05 & log2FoldChange<=-2)
  write.csv(as.data.frame(up[order(up$log2FoldChange,decreasing=TRUE),]), file = paste(i, "up.txt", sep="-"))
  write.csv(as.data.frame(down[order(abs(down$log2FoldChange),decreasing=TRUE),]), file = paste(i, "down.txt", sep="-"))
}

#-- show methods of class DESeq2 --
#x=capture.output(showMethods(class="DESeq2"))
#unlist(lapply(strsplit(x[grep("Function: ",x,)]," "),function(x) x[2]))

volcano plots with automatically finding top_g

#A canonical visualization for interpreting differential gene expression results is the volcano plot.
library(ggrepel)

for i in HSV.d2_vs_control HSV.d4_vs_control HSV.d6_vs_control HSV.d8_vs_control HSV.d4_vs_HSV.d2 HSV.d6_vs_HSV.d2 HSV.d8_vs_HSV.d2 HSV.d6_vs_HSV.d4 HSV.d8_vs_HSV.d4 HSV.d8_vs_HSV.d6; do
#HSV.d4_vs_control HSV.d6_vs_control HSV.d8_vs_control
#for i in K3R_24hdox_vs_K3R_3hdox21hchase WT_3hdox21hchase_vs_K3R_3hdox21hchase; do
#for i in WT_24hdox_vs_K3R_24hdox; do
#for i in WT_24hdox_vs_WT_3hdox21hchase; do
  # read files to geness_res
  echo "geness_res <- read.csv(file = paste(\"${i}\", \"all.txt\", sep=\"-\"), row.names=1)"

  echo "subset(geness_res, external_gene_name %in% top_g & pvalue < 0.05 & (abs(geness_res\$log2FoldChange) >= 2.0))"
  echo "geness_res\$Color <- \"NS or log2FC < 2.0\""
  echo "geness_res\$Color[geness_res\$pvalue < 0.05] <- \"P < 0.05\""
  echo "geness_res\$Color[geness_res\$padj < 0.05] <- \"P-adj < 0.05\""
  echo "geness_res\$Color[abs(geness_res\$log2FoldChange) < 2.0] <- \"NS or log2FC < 2.0\""
  echo "geness_res\$Color <- factor(geness_res\$Color, levels = c(\"NS or log2FC < 2.0\", \"P < 0.05\", \"P-adj < 0.05\"))"
  echo "write.csv(geness_res, \"${i}_with_Category.csv\")"

  # pick top genes for either side of volcano to label
  # order genes for convenience:
  echo "geness_res\$invert_P <- (-log10(geness_res\$pvalue)) * sign(geness_res\$log2FoldChange)"
  echo "top_g <- c()"
  echo "top_g <- c(top_g, \
            geness_res[, 'external_gene_name'][order(geness_res[, 'invert_P'], decreasing = TRUE)[1:100]], \
            geness_res[, 'external_gene_name'][order(geness_res[, 'invert_P'], decreasing = FALSE)[1:100]])"
  echo "top_g <- unique(top_g)"
  echo "geness_res <- geness_res[, -1*ncol(geness_res)]"  # remove invert_P from matrix

  # Graph results
  echo "png(\"${i}.png\",width=1200, height=2000)"
  echo "ggplot(geness_res, \
      aes(x = log2FoldChange, y = -log10(pvalue), \
          color = Color, label = external_gene_name)) + \
      geom_vline(xintercept = c(2.0, -2.0), lty = \"dashed\") + \
      geom_hline(yintercept = -log10(0.05), lty = \"dashed\") + \
      geom_point() + \
      labs(x = \"log2(FC)\", y = \"Significance, -log10(P)\", color = \"Significance\") + \
      scale_color_manual(values = c(\"P-adj < 0.05\"=\"darkblue\",\"P < 0.05\"=\"lightblue\",\"NS or log2FC < 2.0\"=\"darkgray\"),guide = guide_legend(override.aes = list(size = 4))) + scale_y_continuous(expand = expansion(mult = c(0,0.05))) + \
      geom_text_repel(data = subset(geness_res, external_gene_name %in% top_g & pvalue < 0.05 & (abs(geness_res\$log2FoldChange) >= 2.0)), size = 4, point.padding = 0.15, color = \"black\", min.segment.length = .1, box.padding = .2, lwd = 2) + \
      theme_bw(base_size = 16) + \
      theme(legend.position = \"bottom\")"
  echo "dev.off()"
done

sed -i -e 's/Color/Category/g' *_Category.csv

for i in HSV.d2_vs_control HSV.d4_vs_control HSV.d6_vs_control HSV.d8_vs_control HSV.d4_vs_HSV.d2 HSV.d6_vs_HSV.d2 HSV.d8_vs_HSV.d2 HSV.d6_vs_HSV.d4 HSV.d8_vs_HSV.d4 HSV.d8_vs_HSV.d6; do
  echo "~/Tools/csv2xls-0.4/csv_to_xls.py ${i}-all.txt ${i}-up.txt ${i}-down.txt -d$',' -o ${i}.xls;"
done

clustering the genes and draw heatmap

install.packages("gplots")
library("gplots")

for i in HSV.d2_vs_control HSV.d4_vs_control HSV.d6_vs_control HSV.d8_vs_control HSV.d4_vs_HSV.d2 HSV.d6_vs_HSV.d2 HSV.d8_vs_HSV.d2 HSV.d6_vs_HSV.d4 HSV.d8_vs_HSV.d4 HSV.d8_vs_HSV.d6; do
  echo "cut -d',' -f1-1 ${i}-up.txt > ${i}-up.id"
  echo "cut -d',' -f1-1 ${i}-down.txt > ${i}-down.id"
done

    5 HSV.d2_vs_control-down.id
    14 HSV.d2_vs_control-up.id
    77 HSV.d4_vs_control-down.id
  460 HSV.d4_vs_control-up.id

  977 HSV.d6_vs_control-down.id
  1863 HSV.d6_vs_control-up.id
  1361 HSV.d8_vs_control-down.id
  1215 HSV.d8_vs_control-up.id

    35 HSV.d4_vs_HSV.d2-down.id
  205 HSV.d4_vs_HSV.d2-up.id

  832 HSV.d6_vs_HSV.d2-down.id
  1550 HSV.d6_vs_HSV.d2-up.id
  386 HSV.d6_vs_HSV.d4-down.id
  103 HSV.d6_vs_HSV.d4-up.id

  1136 HSV.d8_vs_HSV.d2-down.id
  1050 HSV.d8_vs_HSV.d2-up.id
  598 HSV.d8_vs_HSV.d4-down.id
  292 HSV.d8_vs_HSV.d4-up.id
  305 HSV.d8_vs_HSV.d6-down.id
  133 HSV.d8_vs_HSV.d6-up.id
12597 total

cat *.id | sort -u > ids
#add Gene_Id in the first line, delete the ""
GOI <- read.csv("ids")$Gene_Id
RNASeq.NoCellLine <- assay(rld)

#clustering methods: "ward.D", "ward.D2", "single", "complete", "average" (= UPGMA), "mcquitty" (= WPGMA), "median" (= WPGMC) or "centroid" (= UPGMC).  pearson or spearman
datamat = RNASeq.NoCellLine[GOI, ]
write.csv(as.data.frame(datamat), file ="significant_gene_expressions.txt")
hr <- hclust(as.dist(1-cor(t(datamat), method="pearson")), method="complete")
hc <- hclust(as.dist(1-cor(datamat, method="spearman")), method="complete")
mycl = cutree(hr, h=max(hr$height)/1.05)
mycol = c("YELLOW", "DARKBLUE", "DARKORANGE", "DARKMAGENTA", "DARKCYAN", "DARKRED",  "MAROON", "DARKGREEN", "LIGHTBLUE", "PINK", "MAGENTA", "LIGHTCYAN","LIGHTGREEN", "BLUE", "ORANGE", "CYAN", "RED", "GREEN");

mycol = mycol[as.vector(mycl)]
sampleCols <- rep('GREY',ncol(datamat))
names(sampleCols) <- c("control r1","control r2","day2 r1","day2 r2","day4 r1","day4 r2", "day6 r1","day6 r2", "day8 r1","day8 r2")
#sampleCols[substr(colnames(RNASeq.NoCellLine_),1,4)=='mock'] <- 'GREY'

sampleCols["control r1"] <- 'DARKBLUE'
sampleCols["control r2"] <- 'DARKBLUE'
sampleCols["day2 r1"] <- 'DARKRED'
sampleCols["day2 r2"] <- 'DARKRED'
sampleCols["day4 r1"] <- 'DARKORANGE'
sampleCols["day4 r2"] <- 'DARKORANGE'
sampleCols["day6 r1"] <- 'DARKGREEN'
sampleCols["day6 r2"] <- 'DARKGREEN'
sampleCols["day8 r1"] <- 'DARKCYAN'
sampleCols["day8 r2"] <- 'DARKCYAN'

png("DEGs_heatmap.png", width=1000, height=1200)
heatmap.2(as.matrix(datamat),Rowv=as.dendrogram(hr),Colv = NA, dendrogram = 'row',
            scale='row',trace='none',col=bluered(75), 
            RowSideColors = mycol, ColSideColors = sampleCols, labRow="", margins=c(22,10), cexRow=8, cexCol=2, srtCol=20, lwid=c(1,7), lhei = c(1, 8))
#legend("top", title = "",legend=c("WaGa_RNA","MKL1_RNA","WaGa_EV_RNA","MKL1_EV_RNA"), fill=c("DARKBLUE","DARKRED","DARKORANGE","DARKGREEN"), cex=0.8, box.lty=0)
dev.off()

#c("YELLOW", "DARKBLUE", "DARKORANGE", "DARKMAGENTA", "DARKCYAN", "DARKRED",  "MAROON", "DARKGREEN", "LIGHTBLUE", "PINK", "MAGENTA", "LIGHTCYAN","LIGHTGREEN", "BLUE", "ORANGE", "CYAN", "RED", "GREEN");
write.csv(names(subset(mycl, mycl == '1')),file='cluster1_YELLOW.txt')
write.csv(names(subset(mycl, mycl == '2')),file='cluster2_DARKBLUE.txt') 
write.csv(names(subset(mycl, mycl == '3')),file='cluster3_DARKORANGE.txt')  
write.csv(names(subset(mycl, mycl == '4')),file='cluster4_DARKMAGENTA.txt') 
write.csv(names(subset(mycl, mycl == '5')),file='cluster5_DARKCYAN.txt')  
#~/Tools/csv2xls-0.4/csv_to_xls.py cluster*.txt -d',' -o DEGs_heatmap_cluster_members.xls

~/Tools/csv2xls-0.4/csv_to_xls.py \
significant_gene_expressions.txt \
-d',' -o DEGs_heatmap_expression_data.xls;

like unlike

点赞本文的读者

还没有人对此文章表态

本文有评论

没有评论

RNAseq processing for organoids

本文有评论

看文章，发评论，不要沉默

最受欢迎文章

最新文章

最多评论文章

推荐相似文章