gene_x 0 like s 558 view s
Tags: NGS, pipeline, RNA-seq
install conda environment rnaseq_2021
conda create -n rnaseq_2021 python=3.6.7
conda activate rnaseq_2021
conda install -c conda-forge -c bioconda -c defaults nextflow=21.04 fastqc=0.11.8 trim-galore=0.5.0 star=2.6.1d hisat2=2.1.0
conda install -c conda-forge -c bioconda -c defaults picard=2.18.27 csvtk=0.17.0 preseq=2.0.3
conda install -c conda-forge -c bioconda -c defaults samtools=1.9
conda install -c conda-forge -c bioconda -c defaults gffread=0.9.12
conda install -c conda-forge -c bioconda -c defaults subread=1.6.4
conda install -c conda-forge -c bioconda -c defaults deeptools=3.2.0
conda install -c conda-forge -c bioconda -c defaults multiqc=1.7 #*
conda install -c conda-forge -c bioconda -c defaults conda-forge::r-data.table=1.12.0 conda-forge::r-gplots=3.0.1.1 bioconductor-dupradar=1.12.1 bioconductor-edger=3.24.1
conda install -c conda-forge -c bioconda -c defaults stringtie=1.3.5
conda install -c conda-forge -c bioconda -c defaults rseqc=3.0.0
prepare the virus gtf-file
# -- processing for virus gtf --
# #gffread X14112.1_gene.gff -T -o X14112.1_gene.gtf
# cp X14112.1.gff X14112.1.gff3
# #gffread -E -F --bed X14112.1.gff3 -o X14112.1.bed (change the name errors in 1 intron and 2 genes)
# grep "^##" X14112.1.gff3 > X14112.1_gene.gff3
#
# # --try to filter the file with genes --> failed --
# grep "ID=gene" X14112.1.gff3 >> X14112.1_gene.gff
# cp X14112.1.gff3 X14112.1.gff
# -- generating *_gene.gtf file containing only gene records --
python3 gff2gtf.py
# -- check if gene_id is unique --
cut -f9- -d$'\t' X14112.1_gene.gtf > temp
cut -f1 -d';' temp > temp_ #111
sort temp_ > temp_1
sort temp_ -u > temp_2
diff temp_1 temp_2
#39d38
#< ID=gene-UL29
#59d57
#< ID=gene-UL43
#--> delete short ones of the repeated records --> 109 records
python3 extends.py #generating the file X14112.1_gene_extended.gtf
#Then replace 'transcript_id "exon' --> 'transcript_id "rna' in X14112.1_gene_extended.gtf
gffread -E -F --bed X14112.1_gene_extended.gtf -o X14112.1_gene_extended.bed
#-->bed contains 109 transcript-name for example "rna-gene-RS1-2"
##!!!!OPTIONAL!!!!: don't need to change type '\tgene\t' to '\texon\t', since X14112.1_gene_extended.gtf contains exon-records.
nextflow run rnaseq_old/main.nf --reads "/home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/Raw_Data/*.fastq.gz" --fasta "K14112.1.fasta" --gtf "K14112.1_gene_extended.gtf" --bed12 "K14112.1_gene_extendced.bed" --singleEnd -profile standard --aligner star --fcGroupFeaturesType gene_biotype --skip_genebody_coverage -resume --saveReference
# -- correct some special records (optional, as the processing above didn't genrate the records) --
# delete the lines starting with "#"
# replace "X14112.1:146805..151063" to IE175
# replace "X14112.1:133941..146107" to IE68
# add ;gene=IE68 ;gene=IE175 to the corresponding lines
# -- python code for convert gff to gtf --
# open the input file for reading and the output file for writing
# -- scripts choose gene or exon or mRNA --
# python3 gff2gtf.py #X14112.1.gff-->X14112.1.gtf
# replace '; transcript_id "gene' to '; transcript_id "tx'
# !!!!VERY_IMPORTANT!!!!: change type '\tgene\t' to '\texon\t'!
# sed -i -e "s/\tgene\t/\texon\t/g" X14112.1_gene.gff # since default is --featurecounts_feature_type 'exon'.
nextflow command: the input should be *.umi_extract.fastq.gz.
#SUCCESSFUL
(rnaseq) jhuang@hamburg:~/DATA/Data_Manja_RNAseq_Organoids_Virus$ /home/jhuang/anaconda3/bin/nextflow run rnaseq_old/main.nf --reads "/home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/Raw_Data/*.fastq.gz" --fasta "/home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/X14112.1.fasta" --gtf "/home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/X14112.1_gene_extended.gtf" --bed12 "/home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/X14112.1_gene_extended.bed" --singleEnd -profile standard --aligner hisat2 --fcGroupFeaturesType gene_biotype --skip_genebody_coverage --skip_preseq -resume --saveReference
#NOT_TESTED
(rnaseq_2021) jhuang@hamm:~/DATA/Data_Manja_RNAseq_Organoids_Virus$ nextflow run rnaseq_old/main.nf --reads "/home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/Raw_Data/*.fastq.gz" --fasta "/home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/X14112.1.fasta" --gtf "/home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/X14112.1_gene_extended.gtf" --bed12 "/home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/X14112.1_gene_extended.bed" --singleEnd -profile standard --aligner star -resume --saveReference
snippet of the human hg38 gtf served as a pattern
1 ensembl_havana gene 685679 686673 . - . gene_id "ENSG00000284662"; gene_version "1"; gene_name "OR4F16"; gene_source "ensembl_havana"; gene_biotype "protein_coding";
1 ensembl_havana transcript 685679 686673 . - . gene_id "ENSG00000284662"; gene_version "1"; transcript_id "ENST00000332831"; transcript_version "4"; gene_name "OR4F16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "OR4F16-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS41221"; tag "basic"; transcript_support_level "NA (assigned to previous version 3)";
1 ensembl_havana exon 685679 686673 . - . gene_id "ENSG00000284662"; gene_version "1"; transcript_id "ENST00000332831"; transcript_version "4"; exon_number "1"; gene_name "OR4F16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "OR4F16-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS41221"; exon_id "ENSE00002324228"; exon_version "3"; tag "basic"; transcript_support_level "NA (assigned to previous version 3)";
1 ensembl_havana CDS 685719 686654 . - 0 gene_id "ENSG00000284662"; gene_version "1"; transcript_id "ENST00000332831"; transcript_version "4"; exon_number "1"; gene_name "OR4F16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "OR4F16-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS41221"; protein_id "ENSP00000329982"; protein_version "2"; tag "basic"; transcript_support_level "NA (assigned to previous version 3)";
1 ensembl_havana gene 1211340 1214153 . - . gene_id "ENSG00000186827"; gene_version "11"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding";
1 havana transcript 1211340 1214138 . - . gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000497869"; transcript_version "5"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-203"; transcript_source "havana"; transcript_biotype "retained_intron"; transcript_support_level "2";
1 havana exon 1213983 1214138 . - . gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000497869"; transcript_version "5"; exon_number "1"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-203"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00001480264"; exon_version "3"; transcript_support_level "2";
1 havana exon 1212992 1213785 . - . gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000497869"; transcript_version "5"; exon_number "2"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-203"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00001906619"; exon_version "1"; transcript_support_level "2";
1 havana exon 1212638 1212704 . - . gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000497869"; transcript_version "5"; exon_number "3"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-203"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00003550137"; exon_version "1"; transcript_support_level "2";
1 havana exon 1211942 1212138 . - . gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000497869"; transcript_version "5"; exon_number "4"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-203"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00003604411"; exon_version "1"; transcript_support_level "2";
1 ensembl_havana exon 1211704 1211832 . - . gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000379236"; transcript_version "4"; exon_number "6"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS11"; exon_id "ENSE00001333051"; exon_v
ersion "1"; tag "basic"; transcript_support_level "1 (assigned to previous version 3)";
1 ensembl_havana CDS 1211704 1211832 . - 2 gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000379236"; transcript_version "4"; exon_number "6"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS11"; protein_id "ENSP00000368538"; protein_version "3"; tag "basic"; transcript_support_level "1 (assigned to previous version 3)";
1 ensembl_havana exon 1211340 1211625 . - . gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000379236"; transcript_version "4"; exon_number "7"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS11"; exon_id "ENSE00001915458"; exon_version "2"; tag "basic"; transcript_support_level "1 (assigned to previous version 3)";
1 ensembl_havana CDS 1211558 1211625 . - 2 gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000379236"; transcript_version "4"; exon_number "7"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS11"; protein_id "ENSP00000368538"; protein_version "3"; tag "basic"; transcript_support_level "1 (assigned to previous version 3)";
1 ensembl_havana stop_codon 1211555 1211557 . - 0 gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000379236"; transcript_version "4"; exon_number "7"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS11"; tag "basic"; transcript_support_level "1 (assigned to previous version 3)";
1 ensembl_havana five_prime_utr 1214128 1214153 . - . gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000379236"; transcript_version "4"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS11"; tag "basic"; transcript_support_level "1 (assigned to previous version 3)";
1 ensembl_havana three_prime_utr 1211340 1211554 . - . gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000379236"; transcript_version "4"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS11"; tag "basic"; transcript_support_level "1 (assigned to previous version 3)";
1 havana transcript 1211340 1214138 . - . gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000497869"; transcript_version "5"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-203"; transcript_source "havana"; transcript_biotype "retained_intron"; transcript_support_level "2";
1 havana exon 1213983 1214138 . - . gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000497869"; transcript_version "5"; exon_number "1"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-203"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00001480264"; exon_version "3"; transcript_support_level "2";
1 havana exon 1212992 1213785 . - . gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000497869"; transcript_version "5"; exon_number "2"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-203
"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00001906619"; exon_version "1"; transcript_support_level "2";
1 havana exon 1212638 1212704 . - . gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000497869"; transcript_version "5"; exon_number "3"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-203"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00003550137"; exon_version "1"; transcript_support_level "2";
1 havana exon 1211942 1212138 . - . gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000497869"; transcript_version "5"; exon_number "4"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-203"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00003604411"; exon_version "1"; transcript_support_level "2";
1 havana exon 1211340 1211832 . - . gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000497869"; transcript_version "5"; exon_number "5"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-203"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00001923078"; exon_version "1"; transcript_support_level "2";
1 havana transcript 1212019 1213498 . - . gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000453580"; transcript_version "1"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-202"; transcript_source "havana"; transcript_biotype "processed_transcript"; transcript_support_level "3";
1 havana exon 1213395 1213498 . - . gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000453580"; transcript_version "1"; exon_number "1"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-202"; transcript_source "havana"; transcript_biotype "processed_transcript"; exon_id "ENSE00001680308"; exon_version "1"; transcript_support_level "3";
1 havana exon 1212992 1213093 . - . gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000453580"; transcript_version "1"; exon_number "2"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-202"; transcript_source "havana"; transcript_biotype "processed_transcript"; exon_id "ENSE00003495433"; exon_version "1"; transcript_support_level "3";
1 havana exon 1212638 1212704 . - . gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000453580"; transcript_version "1"; exon_number "3"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-202"; transcript_source "havana"; transcript_biotype "processed_transcript"; exon_id "ENSE00003550137"; exon_version "1"; transcript_support_level "3";
1 havana exon 1212019 1212138 . - . gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000453580"; transcript_version "1"; exon_number "4"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-202"; transcript_source "havana"; transcript_biotype "processed_transcript"; exon_id "ENSE00001723250"; exon_version "1"; transcript_support_level "3";
generate wig files
samtools faidx X14112.1.fasta
cut -f1,2 X14112.1.fasta.fai > results/markDuplicates/chrom.sizes
cd results/markDuplicates/
for sample in control_r1 control_r2 HSV.d2_r1 HSV.d2_r2 HSV.d4_r1 HSV.d4_r2 HSV.d6_r1 HSV.d6_r2 HSV.d8_r1 HSV.d8_r2; do
#bamCoverage -b ${sample}.umi_extract.sorted.markDups.bam -o ${sample}.bw
#bedtools genomecov -ibam ${sample}.umi_extract.sorted.markDups.bam -bg > ${sample}.bedgraph
bedGraphToBigWig ${sample}.bedgraph chrom.sizes ${sample}.bw
bigWigToWig ${sample}.bw ${sample}.wig
done
input and clean data using R
#BiocManager::install(c("DESeq2"))
requiredPackages1 <-c("AnnotationDbi","clusterProfiler","ReactomePA","org.Hs.eg.db","DESeq2", "gplots", "RColorBrewer")
ipak <- function(pkg){
new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
if (length(new.pkg))
install.packages(new.pkg, dependencies = TRUE)
sapply(pkg, require, character.only = TRUE)
}
ipak(requiredPackages1)
#requiredPackages2 <- c("tidyverse")
#ipak(requiredPackages2)
#cut -f1-1 merged_gene_counts.txt > col1.txt
#paste -d$'\t' col1.txt merged_gene_counts2.txt > merged_gene_counts3.txt
#sed -i 's/gene-//g' merged_gene_counts3.txt
#replace "X14112.1" to "X14112"; delete "rna-gene-"; get X14112.1_gene_extended2.gtf; using it in IGV
library("AnnotationDbi")
library("clusterProfiler")
library("ReactomePA")
library("org.Hs.eg.db")
library(DESeq2)
library(gplots)
setwd("~/DATA/Data_Manja_RNAseq_Organoids_Virus/results/featureCounts")
#---- dataset (27) samples (firstly import all samples, then spring to 27-3-1-2) ----
d.raw<- read.delim2("merged_gene_counts3.txt",sep="\t", header=TRUE, row.names=1)
#> head(d.raw,0)
# [1] HSV.d4_r2.umi_extract.sorted.bam HSV.d6_r1.umi_extract.sorted.bam
# [3] HSV.d4_r1.umi_extract.sorted.bam control_r1.umi_extract.sorted.bam
# [5] HSV.d2_r2.umi_extract.sorted.bam control_r2.umi_extract.sorted.bam
# [7] HSV.d2_r1.umi_extract.sorted.bam HSV.d8_r2.umi_extract.sorted.bam
# [9] HSV.d8_r1.umi_extract.sorted.bam HSV.d6_r2.umi_extract.sorted.bam
col.order <- c("control_r1.umi_extract.sorted.bam","control_r2.umi_extract.sorted.bam", "HSV.d2_r1.umi_extract.sorted.bam","HSV.d2_r2.umi_extract.sorted.bam", "HSV.d4_r1.umi_extract.sorted.bam","HSV.d4_r2.umi_extract.sorted.bam", "HSV.d6_r1.umi_extract.sorted.bam","HSV.d6_r2.umi_extract.sorted.bam", "HSV.d8_r1.umi_extract.sorted.bam","HSV.d8_r2.umi_extract.sorted.bam")
d <- d.raw[,col.order] #reordered.raw
#d <- reordered.raw[rowSums(reordered.raw>3)>2,]
colnames(d) = c("control_r1","control_r2", "HSV.d2_r1","HSV.d2_r2", "HSV.d4_r1","HSV.d4_r2", "HSV.d6_r1","HSV.d6_r2", "HSV.d8_r1","HSV.d8_r2")
# Define the replicates and condition of the samples
ids <- factor(c("control_r1","control_r2", "HSV.d2_r1","HSV.d2_r2", "HSV.d4_r1","HSV.d4_r2", "HSV.d6_r1","HSV.d6_r2", "HSV.d8_r1","HSV.d8_r2"))
replicate <- factor(c("r1", "r2", "r1", "r2", "r1", "r2", "r1", "r2", "r1", "r2"))
condition <- factor(c("control", "control", "HSV.d2", "HSV.d2", "HSV.d4", "HSV.d4", "HSV.d6", "HSV.d6", "HSV.d8", "HSV.d8"))
# Construct the DESeqDataSet
cData = data.frame(row.names=colnames(d), replicate=replicate, condition=condition, ids=ids)
dds<-DESeqDataSetFromMatrix(countData=d, colData=cData, design=~condition)
# Run DESeq (early without the step, WRONG?)
dds <- DESeq(dds)
# Apply the rlog transformation
rld <- rlogTransformation(dds)
#rld <- vst(dds) #vsd
#-- save raw_data as xls --
write.csv(d, file="d.csv")
#~/Tools/csv2xls-0.4/csv_to_xls.py d.csv -d$',' -o d.xls
plotting pca and heatmap and remove batchEffect
# -- pca --
png("pca.png", 1200, 800)
plotPCA(rld, intgroup=c("condition"))
#plotPCA(rld, intgroup = c("condition", "batch"))
#plotPCA(rld, intgroup = c("condition", "ids"))
#plotPCA(rld, "batch")
dev.off()
# -- heatmap --
## generate the pairwise comparison between samples
library(gplots)
library("RColorBrewer")
png("heatmap.png", 1200, 800)
distsRL <- dist(t(assay(rld)))
mat <- as.matrix(distsRL)
#paste( rld$dex, rld$cell, sep="-" )
#rownames(mat) <- colnames(mat) <- with(colData(dds),paste(condition,batch, sep=":"))
rownames(mat) <- colnames(mat) <- with(colData(dds),paste(condition,ids, sep=":"))
hc <- hclust(distsRL)
hmcol <- colorRampPalette(brewer.pal(9,"GnBu"))(100)
heatmap.2(mat, Rowv=as.dendrogram(hc),symm=TRUE, trace="none",col = rev(hmcol), margin=c(13, 13))
dev.off()
select the differentially expressed genes
#https://galaxyproject.eu/posts/2020/08/22/three-steps-to-galaxify-your-tool/
#https://www.biostars.org/p/282295/
#https://www.biostars.org/p/335751/
#> condition
# [1] control control HSV.d2 HSV.d2 HSV.d4 HSV.d4 HSV.d6 HSV.d6 HSV.d8 HSV.d8
#Levels: control HSV.d2 HSV.d4 HSV.d6 HSV.d8
#CONSOLE: mkdir /home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/results/featureCounts/degenes
setwd("/home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/results/featureCounts/degenes")
#---- relevel to control ----
dds$condition <- relevel(dds$condition, "control")
dds = DESeq(dds, betaPrior=FALSE)
resultsNames(dds)
clist <- c("HSV.d2_vs_control","HSV.d4_vs_control", "HSV.d6_vs_control", "HSV.d8_vs_control")
dds$condition <- relevel(dds$condition, "HSV.d2")
dds = DESeq(dds, betaPrior=FALSE)
resultsNames(dds)
clist <- c("HSV.d4_vs_HSV.d2", "HSV.d6_vs_HSV.d2", "HSV.d8_vs_HSV.d2")
dds$condition <- relevel(dds$condition, "HSV.d4")
dds = DESeq(dds, betaPrior=FALSE)
resultsNames(dds)
clist <- c("HSV.d6_vs_HSV.d4", "HSV.d8_vs_HSV.d4")
dds$condition <- relevel(dds$condition, "HSV.d6")
dds = DESeq(dds, betaPrior=FALSE)
resultsNames(dds)
clist <- c("HSV.d8_vs_HSV.d6")
for (i in clist) {
contrast = paste("condition", i, sep="_")
res = results(dds, name=contrast)
res <- res[!is.na(res$log2FoldChange),]
res_df <- as.data.frame(res)
write.csv(as.data.frame(res_df[order(res_df$pvalue),]), file = paste(i, "all.txt", sep="-"))
up <- subset(res_df, padj<=0.05 & log2FoldChange>=1.2)
down <- subset(res_df, padj<=0.05 & log2FoldChange<=-1.2)
write.csv(as.data.frame(up[order(up$log2FoldChange,decreasing=TRUE),]), file = paste(i, "up.txt", sep="-"))
write.csv(as.data.frame(down[order(abs(down$log2FoldChange),decreasing=TRUE),]), file = paste(i, "down.txt", sep="-"))
}
##https://github.com/kevinblighe/EnhancedVolcano
#BiocManager::install("EnhancedVolcano")
library("EnhancedVolcano")
#for i in HSV.d2_vs_control HSV.d4_vs_control HSV.d6_vs_control HSV.d8_vs_control; do
#for i in HSV.d4_vs_HSV.d2 HSV.d6_vs_HSV.d2 HSV.d8_vs_HSV.d2; do
#for i in HSV.d6_vs_HSV.d4 HSV.d8_vs_HSV.d4; do
for i in HSV.d8_vs_HSV.d6; do
echo "contrast = paste(\"condition\", \"${i}\", sep=\"_\")"
echo "res = results(dds, name=contrast)"
#echo "res <- res[!is.na(res$log2FoldChange),]"
echo "res <- na.omit(res)"
echo "res_df <- as.data.frame(res)"
#selectLab = selectLab_italics,
echo "png(\"${i}.png\",width=1200, height=1000)"
#legendPosition = 'right',legendLabSize = 12, arrowheads = FALSE,
echo "EnhancedVolcano(res, lab = rownames(res),x = 'log2FoldChange',y = 'padj', pCutoff=5e-2, FCcutoff=1.2, title='', subtitleLabSize = 18, pointSize = 3.0, labSize = 5.0, colAlpha=1, legendIconSize = 4.0, drawConnectors = TRUE, widthConnectors = 0.5, colConnectors = 'black', subtitle=expression(\"$(echo $i | cut -d'_' -f1) versus $(echo $i | cut -d'_' -f3)\"))"
echo "dev.off()"
done
#under DIR degenes under KONSOLE
for i in HSV.d2_vs_control HSV.d4_vs_control HSV.d6_vs_control HSV.d8_vs_control HSV.d4_vs_HSV.d2 HSV.d6_vs_HSV.d2 HSV.d8_vs_HSV.d2 HSV.d6_vs_HSV.d4 HSV.d8_vs_HSV.d4 HSV.d8_vs_HSV.d6; do
echo "~/Tools/csv2xls-0.4/csv_to_xls.py ${i}-all.txt ${i}-up.txt ${i}-down.txt -d$',' -o ${i}.xls;"
done
9 (optional). clustering the genes and draw heatmap
install.packages("gplots")
library("gplots")
for i in HSV.d2_vs_control HSV.d4_vs_control HSV.d6_vs_control HSV.d8_vs_control HSV.d4_vs_HSV.d2 HSV.d6_vs_HSV.d2 HSV.d8_vs_HSV.d2 HSV.d6_vs_HSV.d4 HSV.d8_vs_HSV.d4 HSV.d8_vs_HSV.d6; do
echo "cut -d',' -f1-1 ${i}-up.txt > ${i}-up.id"
echo "cut -d',' -f1-1 ${i}-down.txt > ${i}-down.id"
done
cat *.id | sort -u > ids
#add Gene_Id in the first line, delete the ""
GOI <- read.csv("ids")$Gene_Id
RNASeq.NoCellLine <- assay(rld)
#clustering methods: "ward.D", "ward.D2", "single", "complete", "average" (= UPGMA), "mcquitty" (= WPGMA), "median" (= WPGMC) or "centroid" (= UPGMC). pearson or spearman
datamat = RNASeq.NoCellLine[GOI, ]
write.csv(as.data.frame(datamat), file ="significant_gene_expressions.txt")
hr <- hclust(as.dist(1-cor(t(datamat), method="pearson")), method="complete")
hc <- hclust(as.dist(1-cor(datamat, method="spearman")), method="complete")
mycl = cutree(hr, h=max(hr$height)/1.5)
mycol = c("YELLOW", "BLUE", "ORANGE", "MAGENTA", "CYAN", "RED", "GREEN", "MAROON", "LIGHTBLUE", "PINK", "MAGENTA", "LIGHTCYAN", "LIGHTRED", "LIGHTGREEN");
mycol = mycol[as.vector(mycl)]
#png("DEGs_heatmap.png", width=900, height=800)
#cex.lab=10, labRow="",
png("DEGs_heatmap.png", width=900, height=1000)
heatmap.2(as.matrix(datamat),Rowv=as.dendrogram(hr),Colv = NA, dendrogram = 'row',
scale='row',trace='none',col=bluered(75),
RowSideColors = mycol, margins=c(10,20), cexRow=1.5, srtCol=45, lhei = c(2, 8)) #rownames(datamat)
#heatmap.2(datamat, Rowv=as.dendrogram(hr), col=bluered(75), scale="row", RowSideColors=mycol, trace="none", margin=c(5,5), sepwidth=c(0,0), dendrogram = 'row', Colv = 'false', density.info='none', labRow="", srtCol=30, lhei=c(0.1,2))
dev.off()
#### cluster members #####
write.csv(names(subset(mycl, mycl == '1')),file='cluster1_YELLOW.txt')
write.csv(names(subset(mycl, mycl == '2')),file='cluster2_DARKBLUE.txt')
write.csv(names(subset(mycl, mycl == '3')),file='cluster3_DARKORANGE.txt')
#~/Tools/csv2xls-0.4/csv_to_xls.py cluster*.txt -d',' -o DEGs_heatmap_cluster_members.xls
~/Tools/csv2xls-0.4/csv_to_xls.py \
significant_gene_expressions.txt \
-d',' -o DEGs_heatmap_expression_data.xls;
点赞本文的读者
还没有人对此文章表态
没有评论
Install and configure Docker for 'nextflow run nf-core/rnaseq'
RNA-seq 2024 Ute from raw counts
Preparing a GTF file from GenBank for bacterial RNA-seq analysis, using the example of WA
© 2023 XGenes.com Impressum