prepare virus X14112 gtf for nextflow running

gene_x 0 like s 460 view s

Tags: NGS, pipeline, RNA-seq

  1. install conda environment rnaseq_2021

    conda create -n rnaseq_2021 python=3.6.7
    conda activate rnaseq_2021
    conda install -c conda-forge -c bioconda -c defaults  nextflow=21.04 fastqc=0.11.8 trim-galore=0.5.0 star=2.6.1d hisat2=2.1.0
    conda install -c conda-forge -c bioconda -c defaults  picard=2.18.27 csvtk=0.17.0 preseq=2.0.3 
    conda install -c conda-forge -c bioconda -c defaults  samtools=1.9
    conda install -c conda-forge -c bioconda -c defaults  gffread=0.9.12
    conda install -c conda-forge -c bioconda -c defaults  subread=1.6.4
    conda install -c conda-forge -c bioconda -c defaults  deeptools=3.2.0
    conda install -c conda-forge -c bioconda -c defaults  multiqc=1.7    #*
    conda install -c conda-forge -c bioconda -c defaults  conda-forge::r-data.table=1.12.0 conda-forge::r-gplots=3.0.1.1 bioconductor-dupradar=1.12.1 bioconductor-edger=3.24.1 
    conda install -c conda-forge -c bioconda -c defaults  stringtie=1.3.5
    conda install -c conda-forge -c bioconda -c defaults  rseqc=3.0.0
    
  2. prepare the virus gtf-file

    # -- processing for virus gtf --
    # #gffread X14112.1_gene.gff -T -o X14112.1_gene.gtf
    # cp X14112.1.gff X14112.1.gff3
    # #gffread -E -F --bed X14112.1.gff3 -o X14112.1.bed (change the name errors in 1 intron and 2 genes)
    # grep "^##" X14112.1.gff3 > X14112.1_gene.gff3
    # 
    # # --try to filter the file with genes --> failed --
    # grep "ID=gene" X14112.1.gff3 >> X14112.1_gene.gff
    # cp X14112.1.gff3 X14112.1.gff
    
    # -- generating *_gene.gtf file containing only gene records --
    python3 gff2gtf.py
    # -- check if gene_id is unique --
    cut -f9- -d$'\t' X14112.1_gene.gtf > temp
    cut -f1 -d';' temp > temp_  #111
    sort temp_ > temp_1
    sort temp_ -u > temp_2
    diff temp_1 temp_2
    #39d38
    #< ID=gene-UL29
    #59d57
    #< ID=gene-UL43
    #--> delete short ones of the repeated records --> 109 records
    
    python3 extends.py #generating the file X14112.1_gene_extended.gtf
    #Then replace 'transcript_id "exon' --> 'transcript_id "rna' in X14112.1_gene_extended.gtf
    
    gffread -E -F --bed X14112.1_gene_extended.gtf -o X14112.1_gene_extended.bed
    #-->bed contains 109 transcript-name for example "rna-gene-RS1-2"
    
    ##!!!!OPTIONAL!!!!: don't need to change type '\tgene\t' to '\texon\t', since X14112.1_gene_extended.gtf contains exon-records.
    nextflow run rnaseq_old/main.nf --reads "/home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/Raw_Data/*.fastq.gz" --fasta "K14112.1.fasta" --gtf "K14112.1_gene_extended.gtf" --bed12 "K14112.1_gene_extendced.bed"  --singleEnd -profile standard --aligner star --fcGroupFeaturesType gene_biotype --skip_genebody_coverage -resume --saveReference
    
    # -- correct some special records (optional, as the processing above didn't genrate the records) --
    # delete the lines starting with "#"
    # replace "X14112.1:146805..151063" to IE175
    # replace "X14112.1:133941..146107" to IE68
    # add ;gene=IE68 ;gene=IE175 to the corresponding lines
    # -- python code for convert gff to gtf --
    # open the input file for reading and the output file for writing
    
    # -- scripts choose gene or exon or mRNA --
    # python3 gff2gtf.py #X14112.1.gff-->X14112.1.gtf
    # replace '; transcript_id "gene' to '; transcript_id "tx'
    # !!!!VERY_IMPORTANT!!!!: change type '\tgene\t' to '\texon\t'! 
    # sed -i -e "s/\tgene\t/\texon\t/g" X14112.1_gene.gff # since default is --featurecounts_feature_type 'exon'.
    
  3. nextflow command: the input should be *.umi_extract.fastq.gz.

    #SUCCESSFUL
    (rnaseq) jhuang@hamburg:~/DATA/Data_Manja_RNAseq_Organoids_Virus$ /home/jhuang/anaconda3/bin/nextflow run rnaseq_old/main.nf --reads "/home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/Raw_Data/*.fastq.gz" --fasta "/home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/X14112.1.fasta" --gtf "/home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/X14112.1_gene_extended.gtf" --bed12 "/home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/X14112.1_gene_extended.bed" --singleEnd -profile standard --aligner hisat2 --fcGroupFeaturesType gene_biotype --skip_genebody_coverage --skip_preseq -resume --saveReference
    
    #NOT_TESTED
    (rnaseq_2021) jhuang@hamm:~/DATA/Data_Manja_RNAseq_Organoids_Virus$ nextflow run rnaseq_old/main.nf --reads "/home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/Raw_Data/*.fastq.gz" --fasta "/home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/X14112.1.fasta" --gtf "/home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/X14112.1_gene_extended.gtf" --bed12 "/home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/X14112.1_gene_extended.bed" --singleEnd -profile standard --aligner star  -resume --saveReference
    
  4. snippet of the human hg38 gtf served as a pattern

    1       ensembl_havana  gene    685679  686673  .       -       .       gene_id "ENSG00000284662"; gene_version "1"; gene_name "OR4F16"; gene_source "ensembl_havana"; gene_biotype "protein_coding";
    
    1       ensembl_havana  transcript      685679  686673  .       -       .       gene_id "ENSG00000284662"; gene_version "1"; transcript_id "ENST00000332831"; transcript_version "4"; gene_name "OR4F16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "OR4F16-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS41221"; tag "basic"; transcript_support_level "NA (assigned to previous version 3)";
    
    1       ensembl_havana  exon    685679  686673  .       -       .       gene_id "ENSG00000284662"; gene_version "1"; transcript_id "ENST00000332831"; transcript_version "4"; exon_number "1"; gene_name "OR4F16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "OR4F16-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS41221"; exon_id "ENSE00002324228"; exon_version "3"; tag "basic"; transcript_support_level "NA (assigned to previous version 3)";
    
    1       ensembl_havana  CDS     685719  686654  .       -       0       gene_id "ENSG00000284662"; gene_version "1"; transcript_id "ENST00000332831"; transcript_version "4"; exon_number "1"; gene_name "OR4F16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "OR4F16-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS41221"; protein_id "ENSP00000329982"; protein_version "2"; tag "basic"; transcript_support_level "NA (assigned to previous version 3)";
    
    1       ensembl_havana  gene    1211340 1214153 .       -       .       gene_id "ENSG00000186827"; gene_version "11"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding";
    
    1       havana  transcript      1211340 1214138 .       -       .       gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000497869"; transcript_version "5"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-203"; transcript_source "havana"; transcript_biotype "retained_intron"; transcript_support_level "2";
    
    1       havana  exon    1213983 1214138 .       -       .       gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000497869"; transcript_version "5"; exon_number "1"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-203"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00001480264"; exon_version "3"; transcript_support_level "2";
    1       havana  exon    1212992 1213785 .       -       .       gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000497869"; transcript_version "5"; exon_number "2"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-203"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00001906619"; exon_version "1"; transcript_support_level "2";
    1       havana  exon    1212638 1212704 .       -       .       gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000497869"; transcript_version "5"; exon_number "3"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-203"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00003550137"; exon_version "1"; transcript_support_level "2";
    1       havana  exon    1211942 1212138 .       -       .       gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000497869"; transcript_version "5"; exon_number "4"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-203"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00003604411"; exon_version "1"; transcript_support_level "2";
    1       ensembl_havana  exon    1211704 1211832 .       -       .       gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000379236"; transcript_version "4"; exon_number "6"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS11"; exon_id "ENSE00001333051"; exon_v
    ersion "1"; tag "basic"; transcript_support_level "1 (assigned to previous version 3)";
    1       ensembl_havana  CDS     1211704 1211832 .       -       2       gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000379236"; transcript_version "4"; exon_number "6"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS11"; protein_id "ENSP00000368538"; protein_version "3"; tag "basic"; transcript_support_level "1 (assigned to previous version 3)";
    1       ensembl_havana  exon    1211340 1211625 .       -       .       gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000379236"; transcript_version "4"; exon_number "7"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS11"; exon_id "ENSE00001915458"; exon_version "2"; tag "basic"; transcript_support_level "1 (assigned to previous version 3)";
    1       ensembl_havana  CDS     1211558 1211625 .       -       2       gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000379236"; transcript_version "4"; exon_number "7"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS11"; protein_id "ENSP00000368538"; protein_version "3"; tag "basic"; transcript_support_level "1 (assigned to previous version 3)";
    1       ensembl_havana  stop_codon      1211555 1211557 .       -       0       gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000379236"; transcript_version "4"; exon_number "7"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS11"; tag "basic"; transcript_support_level "1 (assigned to previous version 3)";
    1       ensembl_havana  five_prime_utr  1214128 1214153 .       -       .       gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000379236"; transcript_version "4"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS11"; tag "basic"; transcript_support_level "1 (assigned to previous version 3)";
    1       ensembl_havana  three_prime_utr 1211340 1211554 .       -       .       gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000379236"; transcript_version "4"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS11"; tag "basic"; transcript_support_level "1 (assigned to previous version 3)";
    
    1       havana  transcript      1211340 1214138 .       -       .       gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000497869"; transcript_version "5"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-203"; transcript_source "havana"; transcript_biotype "retained_intron"; transcript_support_level "2";
    1       havana  exon    1213983 1214138 .       -       .       gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000497869"; transcript_version "5"; exon_number "1"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-203"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00001480264"; exon_version "3"; transcript_support_level "2";
    1       havana  exon    1212992 1213785 .       -       .       gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000497869"; transcript_version "5"; exon_number "2"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-203
    "; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00001906619"; exon_version "1"; transcript_support_level "2";
    1       havana  exon    1212638 1212704 .       -       .       gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000497869"; transcript_version "5"; exon_number "3"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-203"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00003550137"; exon_version "1"; transcript_support_level "2";
    1       havana  exon    1211942 1212138 .       -       .       gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000497869"; transcript_version "5"; exon_number "4"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-203"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00003604411"; exon_version "1"; transcript_support_level "2";
    1       havana  exon    1211340 1211832 .       -       .       gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000497869"; transcript_version "5"; exon_number "5"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-203"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00001923078"; exon_version "1"; transcript_support_level "2";
    
    1       havana  transcript      1212019 1213498 .       -       .       gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000453580"; transcript_version "1"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-202"; transcript_source "havana"; transcript_biotype "processed_transcript"; transcript_support_level "3";
    1       havana  exon    1213395 1213498 .       -       .       gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000453580"; transcript_version "1"; exon_number "1"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-202"; transcript_source "havana"; transcript_biotype "processed_transcript"; exon_id "ENSE00001680308"; exon_version "1"; transcript_support_level "3";
    1       havana  exon    1212992 1213093 .       -       .       gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000453580"; transcript_version "1"; exon_number "2"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-202"; transcript_source "havana"; transcript_biotype "processed_transcript"; exon_id "ENSE00003495433"; exon_version "1"; transcript_support_level "3";
    1       havana  exon    1212638 1212704 .       -       .       gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000453580"; transcript_version "1"; exon_number "3"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-202"; transcript_source "havana"; transcript_biotype "processed_transcript"; exon_id "ENSE00003550137"; exon_version "1"; transcript_support_level "3";
    1       havana  exon    1212019 1212138 .       -       .       gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000453580"; transcript_version "1"; exon_number "4"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-202"; transcript_source "havana"; transcript_biotype "processed_transcript"; exon_id "ENSE00001723250"; exon_version "1"; transcript_support_level "3";
    
  5. generate wig files

    samtools faidx X14112.1.fasta
    cut -f1,2 X14112.1.fasta.fai > results/markDuplicates/chrom.sizes
    cd results/markDuplicates/
    for sample in control_r1 control_r2 HSV.d2_r1 HSV.d2_r2 HSV.d4_r1 HSV.d4_r2 HSV.d6_r1 HSV.d6_r2 HSV.d8_r1 HSV.d8_r2; do
        #bamCoverage -b ${sample}.umi_extract.sorted.markDups.bam -o ${sample}.bw
        #bedtools genomecov -ibam ${sample}.umi_extract.sorted.markDups.bam -bg > ${sample}.bedgraph
        bedGraphToBigWig ${sample}.bedgraph chrom.sizes ${sample}.bw
        bigWigToWig ${sample}.bw ${sample}.wig
    done
    
  6. input and clean data using R

    #BiocManager::install(c("DESeq2"))
    requiredPackages1 <-c("AnnotationDbi","clusterProfiler","ReactomePA","org.Hs.eg.db","DESeq2", "gplots", "RColorBrewer")
    ipak <- function(pkg){
            new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
            if (length(new.pkg))
                    install.packages(new.pkg, dependencies = TRUE)
            sapply(pkg, require, character.only = TRUE)
    }
    ipak(requiredPackages1)
    #requiredPackages2 <- c("tidyverse")
    #ipak(requiredPackages2)
    
    #cut -f1-1 merged_gene_counts.txt > col1.txt
    #paste -d$'\t' col1.txt merged_gene_counts2.txt > merged_gene_counts3.txt
    #sed -i 's/gene-//g' merged_gene_counts3.txt
    
    #replace "X14112.1" to "X14112"; delete "rna-gene-"; get X14112.1_gene_extended2.gtf; using it in IGV
    
    library("AnnotationDbi")
    library("clusterProfiler")
    library("ReactomePA")
    library("org.Hs.eg.db")
    library(DESeq2)
    library(gplots)
    setwd("~/DATA/Data_Manja_RNAseq_Organoids_Virus/results/featureCounts")
    
    #---- dataset (27) samples (firstly import all samples, then spring to 27-3-1-2) ----
    d.raw<- read.delim2("merged_gene_counts3.txt",sep="\t", header=TRUE, row.names=1)
    #> head(d.raw,0)
    # [1] HSV.d4_r2.umi_extract.sorted.bam  HSV.d6_r1.umi_extract.sorted.bam 
    # [3] HSV.d4_r1.umi_extract.sorted.bam  control_r1.umi_extract.sorted.bam
    # [5] HSV.d2_r2.umi_extract.sorted.bam  control_r2.umi_extract.sorted.bam
    # [7] HSV.d2_r1.umi_extract.sorted.bam  HSV.d8_r2.umi_extract.sorted.bam 
    # [9] HSV.d8_r1.umi_extract.sorted.bam  HSV.d6_r2.umi_extract.sorted.bam
    
    col.order <-  c("control_r1.umi_extract.sorted.bam","control_r2.umi_extract.sorted.bam", "HSV.d2_r1.umi_extract.sorted.bam","HSV.d2_r2.umi_extract.sorted.bam", "HSV.d4_r1.umi_extract.sorted.bam","HSV.d4_r2.umi_extract.sorted.bam", "HSV.d6_r1.umi_extract.sorted.bam","HSV.d6_r2.umi_extract.sorted.bam", "HSV.d8_r1.umi_extract.sorted.bam","HSV.d8_r2.umi_extract.sorted.bam")
    d <- d.raw[,col.order]  #reordered.raw
    #d <- reordered.raw[rowSums(reordered.raw>3)>2,]
    
    colnames(d) =  c("control_r1","control_r2", "HSV.d2_r1","HSV.d2_r2", "HSV.d4_r1","HSV.d4_r2", "HSV.d6_r1","HSV.d6_r2", "HSV.d8_r1","HSV.d8_r2")
    
    # Define the replicates and condition of the samples
    ids <- factor(c("control_r1","control_r2", "HSV.d2_r1","HSV.d2_r2", "HSV.d4_r1","HSV.d4_r2", "HSV.d6_r1","HSV.d6_r2", "HSV.d8_r1","HSV.d8_r2"))
    replicate <- factor(c("r1", "r2", "r1", "r2", "r1", "r2", "r1", "r2", "r1", "r2"))
    condition <- factor(c("control", "control", "HSV.d2", "HSV.d2", "HSV.d4", "HSV.d4", "HSV.d6", "HSV.d6", "HSV.d8", "HSV.d8"))
    
    # Construct the DESeqDataSet
    cData = data.frame(row.names=colnames(d), replicate=replicate, condition=condition, ids=ids)
    dds<-DESeqDataSetFromMatrix(countData=d, colData=cData, design=~condition)
    
    # Run DESeq (early without the step, WRONG?)
    dds <- DESeq(dds)
    
    # Apply the rlog transformation
    rld <- rlogTransformation(dds)
    #rld <- vst(dds)  #vsd
    
    #-- save raw_data as xls --
    write.csv(d, file="d.csv")
    #~/Tools/csv2xls-0.4/csv_to_xls.py d.csv -d$',' -o d.xls
    
  7. plotting pca and heatmap and remove batchEffect

    # -- pca --
    png("pca.png", 1200, 800)
    plotPCA(rld, intgroup=c("condition"))
    #plotPCA(rld, intgroup = c("condition", "batch"))
    #plotPCA(rld, intgroup = c("condition", "ids"))
    #plotPCA(rld, "batch")
    dev.off()
    
    # -- heatmap --
    ## generate the pairwise comparison between samples
    library(gplots) 
    library("RColorBrewer")
    png("heatmap.png", 1200, 800)
    distsRL <- dist(t(assay(rld)))
    mat <- as.matrix(distsRL)
    #paste( rld$dex, rld$cell, sep="-" )
    #rownames(mat) <- colnames(mat) <- with(colData(dds),paste(condition,batch, sep=":"))
    rownames(mat) <- colnames(mat) <- with(colData(dds),paste(condition,ids, sep=":"))
    hc <- hclust(distsRL)
    hmcol <- colorRampPalette(brewer.pal(9,"GnBu"))(100)
    heatmap.2(mat, Rowv=as.dendrogram(hc),symm=TRUE, trace="none",col = rev(hmcol), margin=c(13, 13))
    dev.off()
    
  8. select the differentially expressed genes

    #https://galaxyproject.eu/posts/2020/08/22/three-steps-to-galaxify-your-tool/
    #https://www.biostars.org/p/282295/
    #https://www.biostars.org/p/335751/
    
    #> condition
    # [1] control control HSV.d2  HSV.d2  HSV.d4  HSV.d4  HSV.d6  HSV.d6  HSV.d8  HSV.d8 
    #Levels: control HSV.d2 HSV.d4 HSV.d6 HSV.d8
    
    #CONSOLE: mkdir /home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/results/featureCounts/degenes
    setwd("/home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/results/featureCounts/degenes")
    #---- relevel to control ----
    dds$condition <- relevel(dds$condition, "control")
    dds = DESeq(dds, betaPrior=FALSE)
    resultsNames(dds)
    clist <- c("HSV.d2_vs_control","HSV.d4_vs_control", "HSV.d6_vs_control", "HSV.d8_vs_control")
    
    dds$condition <- relevel(dds$condition, "HSV.d2")
    dds = DESeq(dds, betaPrior=FALSE)
    resultsNames(dds)
    clist <- c("HSV.d4_vs_HSV.d2", "HSV.d6_vs_HSV.d2", "HSV.d8_vs_HSV.d2")
    
    dds$condition <- relevel(dds$condition, "HSV.d4")
    dds = DESeq(dds, betaPrior=FALSE)
    resultsNames(dds)
    clist <- c("HSV.d6_vs_HSV.d4", "HSV.d8_vs_HSV.d4")
    
    dds$condition <- relevel(dds$condition, "HSV.d6")
    dds = DESeq(dds, betaPrior=FALSE)
    resultsNames(dds)
    clist <- c("HSV.d8_vs_HSV.d6")
    
    for (i in clist) {
      contrast = paste("condition", i, sep="_")
      res = results(dds, name=contrast)
      res <- res[!is.na(res$log2FoldChange),]
      res_df <- as.data.frame(res)
    
      write.csv(as.data.frame(res_df[order(res_df$pvalue),]), file = paste(i, "all.txt", sep="-"))
      up <- subset(res_df, padj<=0.05 & log2FoldChange>=1.2)
      down <- subset(res_df, padj<=0.05 & log2FoldChange<=-1.2)
      write.csv(as.data.frame(up[order(up$log2FoldChange,decreasing=TRUE),]), file = paste(i, "up.txt", sep="-"))
      write.csv(as.data.frame(down[order(abs(down$log2FoldChange),decreasing=TRUE),]), file = paste(i, "down.txt", sep="-"))
    }
    
    ##https://github.com/kevinblighe/EnhancedVolcano
    #BiocManager::install("EnhancedVolcano")
    library("EnhancedVolcano")
    #for i in HSV.d2_vs_control HSV.d4_vs_control HSV.d6_vs_control HSV.d8_vs_control; do
    #for i in HSV.d4_vs_HSV.d2 HSV.d6_vs_HSV.d2 HSV.d8_vs_HSV.d2; do
    #for i in HSV.d6_vs_HSV.d4 HSV.d8_vs_HSV.d4; do
    for i in HSV.d8_vs_HSV.d6; do
      echo "contrast = paste(\"condition\", \"${i}\", sep=\"_\")"
      echo "res = results(dds, name=contrast)"
      #echo "res <- res[!is.na(res$log2FoldChange),]"
      echo "res <- na.omit(res)"
      echo "res_df <- as.data.frame(res)"
      #selectLab = selectLab_italics,
      echo "png(\"${i}.png\",width=1200, height=1000)"
      #legendPosition = 'right',legendLabSize = 12,  arrowheads = FALSE,
      echo "EnhancedVolcano(res, lab = rownames(res),x = 'log2FoldChange',y = 'padj', pCutoff=5e-2, FCcutoff=1.2, title='', subtitleLabSize = 18, pointSize = 3.0, labSize = 5.0, colAlpha=1, legendIconSize = 4.0, drawConnectors = TRUE, widthConnectors = 0.5, colConnectors = 'black', subtitle=expression(\"$(echo $i | cut -d'_' -f1) versus $(echo $i | cut -d'_' -f3)\"))"
      echo "dev.off()"
    done
    
    #under DIR degenes under KONSOLE
    for i in HSV.d2_vs_control HSV.d4_vs_control HSV.d6_vs_control HSV.d8_vs_control HSV.d4_vs_HSV.d2 HSV.d6_vs_HSV.d2 HSV.d8_vs_HSV.d2 HSV.d6_vs_HSV.d4 HSV.d8_vs_HSV.d4 HSV.d8_vs_HSV.d6; do
      echo "~/Tools/csv2xls-0.4/csv_to_xls.py ${i}-all.txt ${i}-up.txt ${i}-down.txt -d$',' -o ${i}.xls;"
    done
    

9 (optional). clustering the genes and draw heatmap

    install.packages("gplots")
    library("gplots")

    for i in HSV.d2_vs_control HSV.d4_vs_control HSV.d6_vs_control HSV.d8_vs_control HSV.d4_vs_HSV.d2 HSV.d6_vs_HSV.d2 HSV.d8_vs_HSV.d2 HSV.d6_vs_HSV.d4 HSV.d8_vs_HSV.d4 HSV.d8_vs_HSV.d6; do
      echo "cut -d',' -f1-1 ${i}-up.txt > ${i}-up.id"
      echo "cut -d',' -f1-1 ${i}-down.txt > ${i}-down.id"
    done

    cat *.id | sort -u > ids
    #add Gene_Id in the first line, delete the ""
    GOI <- read.csv("ids")$Gene_Id
    RNASeq.NoCellLine <- assay(rld)

    #clustering methods: "ward.D", "ward.D2", "single", "complete", "average" (= UPGMA), "mcquitty" (= WPGMA), "median" (= WPGMC) or "centroid" (= UPGMC).  pearson or spearman
    datamat = RNASeq.NoCellLine[GOI, ]
    write.csv(as.data.frame(datamat), file ="significant_gene_expressions.txt")
    hr <- hclust(as.dist(1-cor(t(datamat), method="pearson")), method="complete")
    hc <- hclust(as.dist(1-cor(datamat, method="spearman")), method="complete")
    mycl = cutree(hr, h=max(hr$height)/1.5)
    mycol = c("YELLOW", "BLUE", "ORANGE", "MAGENTA", "CYAN", "RED", "GREEN", "MAROON", "LIGHTBLUE", "PINK", "MAGENTA", "LIGHTCYAN", "LIGHTRED", "LIGHTGREEN");
    mycol = mycol[as.vector(mycl)]

    #png("DEGs_heatmap.png", width=900, height=800)
    #cex.lab=10, labRow="",
    png("DEGs_heatmap.png", width=900, height=1000)
    heatmap.2(as.matrix(datamat),Rowv=as.dendrogram(hr),Colv = NA, dendrogram = 'row',
                scale='row',trace='none',col=bluered(75), 
                RowSideColors = mycol, margins=c(10,20), cexRow=1.5, srtCol=45, lhei = c(2, 8))  #rownames(datamat)  
    #heatmap.2(datamat, Rowv=as.dendrogram(hr), col=bluered(75), scale="row", RowSideColors=mycol, trace="none", margin=c(5,5), sepwidth=c(0,0), dendrogram = 'row', Colv = 'false', density.info='none', labRow="", srtCol=30, lhei=c(0.1,2))
    dev.off()

    #### cluster members #####
    write.csv(names(subset(mycl, mycl == '1')),file='cluster1_YELLOW.txt')
    write.csv(names(subset(mycl, mycl == '2')),file='cluster2_DARKBLUE.txt') 
    write.csv(names(subset(mycl, mycl == '3')),file='cluster3_DARKORANGE.txt')  
    #~/Tools/csv2xls-0.4/csv_to_xls.py cluster*.txt -d',' -o DEGs_heatmap_cluster_members.xls

    ~/Tools/csv2xls-0.4/csv_to_xls.py \
    significant_gene_expressions.txt \
    -d',' -o DEGs_heatmap_expression_data.xls;

like unlike

点赞本文的读者

还没有人对此文章表态


本文有评论

没有评论

看文章,发评论,不要沉默


© 2023 XGenes.com Impressum