RNA-seq skin organoids on GRCh38+chrHsv1

gene_x 0 like s 468 view s

Tags: R, pipeline

  1. import data and pca-plot

    # Import the required libraries
    library("AnnotationDbi")
    library("clusterProfiler")
    library("ReactomePA")
    library(gplots)
    library(tximport)
    library(DESeq2)
    setwd("~/DATA/Data_Manja_RNAseq_Organoids_Virus/results_chrHsv1_downstream/star_salmon")
    # Define paths to your Salmon output quantification files
    files <- c("control_r1" = "./control_r1/quant.sf",
              "control_r2" = "./control_r2/quant.sf",
              "HSV.d2_r1" = "./HSV.d2_r1/quant.sf",
              "HSV.d2_r2" = "./HSV.d2_r2/quant.sf",
              "HSV.d4_r1" = "./HSV.d4_r1/quant.sf",
              "HSV.d4_r2" = "./HSV.d4_r2/quant.sf",
              "HSV.d6_r1" = "./HSV.d6_r1/quant.sf",
              "HSV.d6_r2" = "./HSV.d6_r2/quant.sf",
              "HSV.d8_r1" = "./HSV.d8_r1/quant.sf",
              "HSV.d8_r2" = "./HSV.d8_r2/quant.sf")
    # Import the transcript abundance data with tximport
    txi <- tximport(files, type = "salmon", txIn = TRUE, txOut = TRUE)
    # Define the replicates and condition of the samples
    replicate <- factor(c("r1", "r2", "r1", "r2", "r1", "r2", "r1", "r2", "r1", "r2"))
    condition <- factor(c("control", "control", "HSV.d2", "HSV.d2", "HSV.d4", "HSV.d4", "HSV.d6", "HSV.d6", "HSV.d8", "HSV.d8"))
    # Define the colData for DESeq2
    colData <- data.frame(condition=condition, replicate=replicate, row.names=names(files))
    # Create DESeqDataSet object
    dds <- DESeqDataSetFromTximport(txi, colData=colData, design=~condition)
    # In the context of your new code which is using tximport and DESeq2, you don't necessarily need this step. The reason is that DESeq2 performs its own filtering of low-count genes during the normalization and differential expression steps.
    # Filter data to retain only genes with more than 2 counts > 3 across all samples
    # dds <- dds[rowSums(counts(dds) > 3) > 2, ]
    # Output raw count data to a CSV file
    write.csv(counts(dds), file="transcript_counts.csv")
    
    # -- gene-level count data --
    # Read in the tx2gene map from salmon_tx2gene.tsv
    #tx2gene <- read.csv("salmon_tx2gene.tsv", sep="\t", header=FALSE)
    tx2gene <- read.table("salmon_tx2gene.tsv", header=FALSE, stringsAsFactors=FALSE)
    # Set the column names
    colnames(tx2gene) <- c("transcript_id", "gene_id", "gene_name")
    # Remove the gene_name column if not needed
    tx2gene <- tx2gene[,1:2]
    # Import and summarize the Salmon data with tximport
    txi <- tximport(files, type = "salmon", tx2gene = tx2gene, txOut = FALSE)
    # Continue with the DESeq2 workflow as before...
    colData <- data.frame(condition=condition, replicate=replicate, row.names=names(files))
    dds <- DESeqDataSetFromTximport(txi, colData=colData, design=~condition)
    #dds <- dds[rowSums(counts(dds) > 3) > 2, ]    #60605-->26543
    
    write.csv(counts(dds, normalized=FALSE), file="gene_counts.csv")
    #~/Tools/csv2xls-0.4/csv_to_xls.py gene_counts.csv -d',' -o gene_counts.xls
    #TODO: why a lot of reads were removed due to the too_short?
    #STAR --runThreadN 4 --genomeDir /path/to/GenomeDir --readFilesIn /path/to/read1.fastq /path/to/read2.fastq --outFilterMatchNmin 50 --outSAMtype BAM SortedByCoordinate --outFileNamePrefix /path/to/output
    dim(counts(dds))
    head(counts(dds), 10)
    
    #DEBUG: DESeq should not used here!?
    #TODO_NEXT_WEEK: rerun without fistly DESeq(dds) to compare if the results is the same to process_1 
    #dds <- DESeq(dds)
    rld <- rlogTransformation(dds)
    
    # draw simple pca and heatmap
    library(gplots) 
    library("RColorBrewer")
    #mat <- assay(rld)
    #mm <- model.matrix(~condition, colData(rld))
    #mat <- limma::removeBatchEffect(mat, batch=rld$batch, design=mm)
    #assay(rld) <- mat
    # -- pca --
    png("pca.png", 1200, 800)
    plotPCA(rld, intgroup=c("condition"))
    dev.off()
    # -- heatmap --
    png("heatmap.png", 1200, 800)
    distsRL <- dist(t(assay(rld)))
    mat <- as.matrix(distsRL)
    hc <- hclust(distsRL)
    hmcol <- colorRampPalette(brewer.pal(9,"GnBu"))(100)
    heatmap.2(mat, Rowv=as.dendrogram(hc),symm=TRUE, trace="none",col = rev(hmcol), margin=c(13, 13))
    dev.off()
    
  2. draw 3D PCA plots.

    library(gplots) 
    library("RColorBrewer")
    
    library(ggplot2)
    data <- plotPCA(rld, intgroup=c("condition", "replicate"), returnData=TRUE)
    write.csv(data, file="plotPCA_data.csv")
    #calculate all PCs including PC3 with the following codes
    library(genefilter)
    ntop <- 500
    rv <- rowVars(assay(rld))
    select <- order(rv, decreasing = TRUE)[seq_len(min(ntop, length(rv)))]
    mat <- t( assay(rld)[select, ] )
    pc <- prcomp(mat)
    pc$x[,1:3]
    #df_pc <- data.frame(pc$x[,1:3])
    df_pc <- data.frame(pc$x)
    identical(rownames(data), rownames(df_pc)) #-->TRUE
    
    data$PC1 <- NULL
    data$PC2 <- NULL
    merged_df <- merge(data, df_pc, by = "row.names")
    #merged_df <- merged_df[, -1]
    row.names(merged_df) <- merged_df$Row.names
    merged_df$Row.names <- NULL  # remove the "name" column
    merged_df$name <- NULL
    merged_df <- merged_df[, c("PC1","PC2","PC3","PC4","PC5","PC6","PC7","PC8","PC9","PC10","group","condition","replicate")]
    write.csv(merged_df, file="merged_df_10PCs.csv")
    summary(pc)  
    #0.5333  0.2125 0.06852
    #0.8026 0.09042 0.06578
    
    draw_3D.py
    
    # adjust proportion to real values in the following plot
    import plotly.graph_objects as go
    import pandas as pd
    from sklearn.decomposition import PCA
    import numpy as np
    # Read in data as a pandas dataframe
    #df = pd.DataFrame({
    #    'PC1': [-13.999925, -12.504291, -12.443057, -13.065235, -17.316215],
    #    'PC2': [-1.498823, -3.342411, -6.067055, -8.205809, 3.293993],
    #    'PC3': [-3.335085, 15.207755, -14.725450, 15.078469, -6.917358],
    #    'condition': ['GFP d3', 'GFP d3', 'GFP d8', 'GFP d8', 'GFP+mCh d9/12'],
    #    'replicate': ['DI', 'DII', 'DI', 'DII', 'DI']
    #})
    df = pd.read_csv('merged_df_10PCs.csv', index_col=0, header=0)
    df['condition'] = df['condition'].replace("control", "control")
    df['condition'] = df['condition'].replace("HSV.d2", "day 2")
    df['condition'] = df['condition'].replace("HSV.d4", "day 4")
    df['condition'] = df['condition'].replace("HSV.d6", "day 6")
    df['condition'] = df['condition'].replace("HSV.d8", "day 8")
    # Fit PCA model to reduce data dimensions to 3
    pca = PCA(n_components=3)
    pca.fit(df.iloc[:, :-3])
    X_reduced = pca.transform(df.iloc[:, :-3])
    
    # Get variance ratios
    explained_variance_ratio = pca.explained_variance_ratio_
    
    # Add reduced data back to dataframe
    df['PC1'] = X_reduced[:, 0]
    df['PC2'] = X_reduced[:, 1]
    df['PC3'] = X_reduced[:, 2]
    # Create PCA plot with 3D scatter
    fig = go.Figure()
    
    ##ff7f00
    condition_color_map = {
        'control': 'rgb(100, 100, 100)',
        'day 2': '#33a02c',
        'day 4': '#1f78b4',
        'day 6': '#e31a1c',
        'day 8': 'magenta'
    }
    replicate_symbol_map = {'r1': 'circle', 'r2': 'diamond'}
    for replicate, replicate_symbol in replicate_symbol_map.items():
        for condition, condition_color in condition_color_map.items():
            mask = (df['condition'] == condition) & (df['replicate'] == replicate)
            fig.add_trace(go.Scatter3d(x=df.loc[mask, 'PC1'], y=df.loc[mask, 'PC2'], z=df.loc[mask, 'PC3'],
                                        mode='markers',
                                        name=f'{condition}' if replicate == 'r1' else None,
                                        legendgroup=f'{condition}',
                                        showlegend=True if replicate == 'r1' else False,
                                        marker=dict(size=6 if replicate_symbol in ['diamond'] else 10, opacity=0.8, color=condition_color, symbol=replicate_symbol)))
    for replicate, replicate_symbol in replicate_symbol_map.items():
        fig.add_trace(go.Scatter3d(x=[None], y=[None], z=[None],
                                    mode='markers',
                                    name=replicate,
                                    legendgroup=f'{replicate}',
                                    showlegend=True,
                                    marker=dict(size=10, opacity=1, color='black', symbol=replicate_symbol),
                                    hoverinfo='none'))
    # Annotations for the legend blocks
    #TODO: calculate the PC values.
    #TODO: adjust the axis length according to the actual size of axis!
    fig.update_layout(
        annotations=[
            dict(x=1.1, y=1.0, xref='paper', yref='paper', showarrow=False,
                  text='Condition', font=dict(size=15)),
            dict(x=1.1, y=0.6, xref='paper', yref='paper', showarrow=False,
                  text='Replicate', font=dict(size=15))
        ],
        scene=dict(
            #aspectmode='cube',
            #xaxis=dict(gridcolor='black', backgroundcolor='white', zerolinecolor='black', title='PC1: 53% v.', scaleratio=0.53),
            #yaxis=dict(gridcolor='black', backgroundcolor='white', zerolinecolor='black', title='PC2: 21% v.', scaleratio=0.21),
            #zaxis=dict(gridcolor='black', backgroundcolor='white', zerolinecolor='black', title='PC3: 7% variance', scaleratio=0.07),
            aspectmode='manual',
            aspectratio=dict(x=explained_variance_ratio[0], y=explained_variance_ratio[1], z=explained_variance_ratio[2]),
            xaxis=dict(gridcolor='black', backgroundcolor='white', zerolinecolor='black', title='PC1: 53% v.', range=[min(df['PC1']), max(df['PC1'])]),
            yaxis=dict(gridcolor='black', backgroundcolor='white', zerolinecolor='black', title='PC2: 21% v.', range=[min(df['PC2']), max(df['PC2'])]),
            zaxis=dict(gridcolor='black', backgroundcolor='white', zerolinecolor='black', title='PC3: 7% variance', range=[min(df['PC3']), max(df['PC3'])]),
    
            bgcolor='white'
        ),
        margin=dict(l=5, r=5, b=5, t=0)  # Adjust the margins to prevent clipping of axis titles
    )
    #fig.show()
    fig.write_image("fig1.svg")
    
    fig.update_layout(
        annotations=[
            dict(x=1.1, y=1.0, xref='paper', yref='paper', showarrow=False,
                text='Condition', font=dict(size=15)),
            dict(x=1.1, y=0.6, xref='paper', yref='paper', showarrow=False,
                text='Donor', font=dict(size=15)),
            dict(x=1.08, y=0.2, xref='paper', yref='paper', showarrow=False,
                text=f'PC3: {explained_variance_ratio[2]*100:.2f}% v.', font=dict(size=15), textangle=-90)
        ],
        scene=dict(
            aspectmode='manual',
            aspectratio=dict(x=explained_variance_ratio[0]*2, y=explained_variance_ratio[1]*2, z=explained_variance_ratio[2]*2),
            #, range=[min(df['PC1']), max(df['PC1'])]
            #, range=[min(df['PC2']), max(df['PC2'])]
            #, range=[min(df['PC3']), max(df['PC3'])]
            xaxis=dict(gridcolor='black', backgroundcolor='white', zerolinecolor='black', title=f'PC1: {explained_variance_ratio[0]*100:.2f}% variance'),
            yaxis=dict(gridcolor='black', backgroundcolor='white', zerolinecolor='black', title=f'PC2: {explained_variance_ratio[1]*100:.2f}% v.'),
            zaxis=dict(gridcolor='black', backgroundcolor='white', zerolinecolor='black', title=''),
            bgcolor='white'
        ),
        margin=dict(l=0, r=0, b=0, t=0)  # Adjust the margins to prevent clipping of axis titles
    )
    fig.write_image("PCA_3D.svg")
    #/usr/bin/convert g235.png -crop 3250x1680+1+750 PCA_3D_.png
    
  3. (optional) estimate size factors

    > head(dds)
    class: DESeqDataSet 
    dim: 6 10 
    metadata(1): version
    assays(6): counts avgTxLength ... H cooks
    rownames(6): ENSG00000000003 ENSG00000000005 ... ENSG00000000460
      ENSG00000000938
    rowData names(34): baseMean baseVar ... deviance maxCooks
    colnames(10): control_r1 control_r2 ... HSV.d8_r1 HSV.d8_r2
    colData names(2): condition replicate
    
    #convert bam to bigwig using deepTools by feeding inverse of DESeq’s size Factor
    sizeFactors(dds)
    #NULL
    dds <- estimateSizeFactors(dds)
    > sizeFactors(dds)
    
    raw_counts <- counts(dds)
    normalized_counts <- counts(dds, normalized=TRUE)
    #write.table(raw_counts, file="raw_counts.txt", sep="\t", quote=F, col.names=NA)
    #write.table(normalized_counts, file="normalized_counts.txt", sep="\t", quote=F, col.names=NA)
    
    # ---- DEBUG sizeFactors(dds) always NULL, see https://support.bioconductor.org/p/97676/ ----
    nm <- assays(dds)[["avgTxLength"]]
    sf <- estimateSizeFactorsForMatrix(counts(dds), normMatrix=nm)
    
    assays(dds)$counts  # for count data
    assays(dds)$avgTxLength  # for average transcript length, etc.
    assays(dds)$normalizationFactors
    
    In normal circumstances, the size factors should be stored in the DESeqDataSet object itself and not in the assays, so they are typically not retrievable via the assays() function. However, due to the issues you're experiencing, you might be able to manually compute the size factors and assign them back to the DESeqDataSet.
    
    To calculate size factors manually, DESeq2 uses the median ratio method. Here's a very simplified version of how you could compute this manually:
    > assays(dds)
    List of length 6
    names(6): counts avgTxLength normalizationFactors mu H cooks
    
    To calculate size factors manually, DESeq2 uses the median ratio method. Here's a very simplified version of how you could compute this manually:
    
    geoMeans <- apply(assays(dds)$counts, 1, function(row) if (all(row == 0)) 0 else exp(mean(log(row[row != 0]))))
    sizeFactors(dds) <- median(assays(dds)$counts / geoMeans, na.rm = TRUE)
    
    # ---- DEBUG END ----
    
    #unter konsole
    #  control_r1  ...
    # 1/0.9978755  ...
    
    > sizeFactors(dds)
                        HeLa_TO_r1                      HeLa_TO_r2 
                          0.9978755                       1.1092227
    
    1/0.9978755=1.002129023
    1/1.1092227=
    
    #bamCoverage --bam ../markDuplicates/${sample}Aligned.sortedByCoord.out.bam -o ${sample}_norm.bw --binSize 10 --scaleFactor  --effectiveGenomeSize 2864785220
    bamCoverage --bam ../markDuplicates/HeLa_TO_r1Aligned.sortedByCoord.out.markDups.bam -o HeLa_TO_r1.bw --binSize 10 --scaleFactor 1.002129023     --effectiveGenomeSize 2864785220
    bamCoverage --bam ../markDuplicates/HeLa_TO_r2Aligned.sortedByCoord.out.markDups.bam -o HeLa_TO_r2.bw --binSize 10 --scaleFactor  0.901532217        --effectiveGenomeSize 2864785220
    
  4. compare the normalization methods

    # ---- draw normalization before and after ----
    ### Let's implement such a function
    ### cds is a countDataset
    estimSf <- function (cds){
        # Get the count matrix
        cts <- counts(cds)
    
        # Compute the geometric mean
        geomMean <- function(x) prod(x)^(1/length(x))
    
        # Compute the geometric mean over the line
        gm.mean  <-  apply(cts, 1, geomMean)
    
        # Zero values are set to NA (avoid subsequentcdsdivision by 0)
        gm.mean[gm.mean == 0] <- NA
    
        # Divide each line by its corresponding geometric mean
        # sweep(x, MARGIN, STATS, FUN = "-", check.margin = TRUE, ...)
        # MARGIN: 1 or 2 (line or columns)
        # STATS: a vector of length nrow(x) or ncol(x), depending on MARGIN
        # FUN: the function to be applied
        cts <- sweep(cts, 1, gm.mean, FUN="/")
    
        # Compute the median over the columns
        med <- apply(cts, 2, median, na.rm=TRUE)
    
        # Return the scaling factor
        return(med)
    }
    #https://dputhier.github.io/ASG/practicals/rnaseq_diff_Snf2/rnaseq_diff_Snf2.html
    #http://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#data-transformations-and-visualization
    #https://hbctraining.github.io/DGE_workshop/lessons/02_DGE_count_normalization.html
    #https://hbctraining.github.io/DGE_workshop/lessons/04_DGE_DESeq2_analysis.html
    #https://genviz.org/module-04-expression/0004/02/01/DifferentialExpression/
    #DESeq2’s median of ratios [1]
    #EdgeR’s trimmed mean of M values (TMM) [2]
    #http://www.nathalievialaneix.eu/doc/html/TP1_normalization.html  #very good website!
    test_normcount <- sweep(raw_counts, 2, sizeFactors(dds), "/")
    sum(test_normcount != normalized_counts)
    
    head(estimSf(dds))
    all(round(estimSf(dds),6) == round(sizeFactors(dds), 6))
    ## Checking the normalization
    png("normalization.png", width=800, height=600)
    epsilon <- 1 # pseudo-count to avoid problems with log(0)
    par(mfrow=c(1,2),cex.lab=0.7)
    boxplot(log2(raw_counts+epsilon), cex.axis=0.7, las=1, xlab="log2(raw counts)", horizontal=TRUE, main="Raw counts")
    boxplot(log2(normalized_counts+epsilon), cex.axis=0.7, las=1, xlab="log2(normalized counts)", horizontal=TRUE, main="Normalized counts") 
    #boxplot(log10(assays(dds)[["cooks"]]), range=0, las=2)
    #plotDensity(log2(counts(dds.norm)+epsilon),  col=col.pheno.selected, 
    #            xlab="log2(counts)", cex.lab=0.7, panel.first=grid()) 
    #plotDensity(log2(counts(dds.norm, normalized=TRUE)+epsilon), col=col.pheno.selected, 
    #            xlab="log2(normalized counts)", cex.lab=0.7, panel.first=grid()) 
    dev.off()
    
    # since we Gene-level differential expression analysis with DESeq2, the splicing plays no role in the analysis!
    # 用nanopore 可以 compare transcript length distribution. 有可能Cellline很长,Extracellular vesicles (EVs)很短!
    
    library(ggplot2)
    library(gridExtra)
    library(reshape2)
    library(mixOmics)
    library(RColorBrewer)
    library(DESeq)
    library(edgeR)
    library(VennDiagram)
    library(devtools)
    raw_counts_wn <- raw_counts[rowSums(raw_counts) > 0, ]
    dim(raw_counts_wn)
    
    #--Raw counts--
    pseudo_counts <- log2(raw_counts_wn + 1)
    head(pseudo_counts)
    df_raw <- melt(pseudo_counts, id = rownames(raw_counts_wn))
    names(df_raw)[1:2]<- c("id", "sample")
    df_raw$method <- rep("Raw counts", nrow(df_raw))  
    head(df_raw)
    
    #--DESeq--
    cData = data.frame(row.names=colnames(raw_counts_wn), replicates=replicates, ids=ids)
    dge<-DESeqDataSetFromMatrix(countData=raw_counts_wn, colData=cData, design=~replicates)
    dge <- estimateSizeFactors(dge)
    sizeFactors(dge)
    deseq_normcount <- counts(dge, normalized = TRUE)
    test_normcount <- sweep(raw_counts_wn, 2, sizeFactors(dge), "/")
    sum(test_normcount != deseq_normcount)
    pseudo_deseq <- log2(deseq_normcount + 1)
    df_deseq <- melt(pseudo_deseq, id = rownames(raw_counts_wn))
    names(df_deseq)[1:2]<- c("id", "sample")
    df_deseq$method <- rep("DESeq (RLE)", nrow(df_raw))
    
    #--edgeR--
    dge2 <- DGEList(raw_counts_wn)
    dge2
    dge2$samples
    
    #--Total count--
    pseudo_TC <- log2(cpm(dge2) + 1)
    df_TC <- melt(pseudo_TC, id = rownames(raw_counts_wn))
    names(df_TC)[1:2] <- c ("id", "sample")
    df_TC$method <- rep("TC", nrow(df_TC))
    
    ##--RPKM--
    #gene_lengths_wn <- gene_lengths[rowSums(raw_counts) > 0]
    #pseudo_RPKM <- log2(rpkm(dge2, gene.length = gene_lengths_wn) + 1)
    #df_RPKM <- melt(pseudo_RPKM, id = rownames(raw_counts_wn))
    #names(df_RPKM)[1:2] <- c ("id", "sample")
    #df_RPKM$method <- rep("RPKM", nrow(df_RPKM))
    
    #--Upper quartile--
    dge2 <- calcNormFactors(dge2, method = "upperquartile")
    dge2$samples
    test_normcount <- sweep(dge2$counts, 2,
                            dge2$samples$lib.size*dge2$samples$norm.factors / 10^6,
                            "/")
    range(as.vector(test_normcount - cpm(dge2)))
    pseudo_UQ <- log2(cpm(dge2) + 1)
    
    df_UQ <- melt(pseudo_UQ, id = rownames(raw_counts_wn))
    names(df_UQ)[1:2] <- c ("id", "sample")
    df_UQ$method <- rep("UQ", nrow(df_UQ))
    
    #--TMM--
    dge2 <- calcNormFactors(dge2, method = "TMM")
    dge2$samples
    pseudo_TMM <- log2(cpm(dge2) + 1)
    df_TMM <- melt(pseudo_TMM, id = rownames(raw_counts_wn))
    names(df_TMM)[1:2] <- c ("id", "sample")
    #MODIFIED!
    df_TMM$method <- rep("DESeq (RLE)", nrow(df_TMM))  #TMM
    
    #--Comparison--
    png("normalization.png", width=800, height=600)
    #df_allnorm <- rbind(df_raw, df_deseq, df_TC, df_UQ, df_TMM)
    #df_allnorm$method <- factor(df_allnorm$method, levels = c("Raw counts", "DESeq (RLE)", "TC",  "TMM", "UQ"))
    df_allnorm <- rbind(df_raw, df_TMM)
    df_allnorm$method <- factor(df_allnorm$method, levels = c("Raw counts", "DESeq (RLE)"))
    p <- ggplot(data=df_allnorm, aes(x=sample, y=value, fill=method))
    p <- p + geom_boxplot()  
    p <- p + theme_bw()
    p <- p + ggtitle("Boxplots of normalized pseudo counts\n
    for all samples by normalization methods")
    p <- p + facet_grid(. ~ method) 
    p <- p + ylab(expression(log[2] ~ (normalized ~ count + 1))) + xlab("")
    p <- p + theme(title = element_text(size=10), axis.text.x = element_blank(), 
                  axis.ticks.x = element_blank())
    print(p)
    dev.off()
    
  5. select the differentially expressed genes

    #https://galaxyproject.eu/posts/2020/08/22/three-steps-to-galaxify-your-tool/
    #https://www.biostars.org/p/282295/
    #https://www.biostars.org/p/335751/
    
    #> condition
    # [1] control control HSV.d2  HSV.d2  HSV.d4  HSV.d4  HSV.d6  HSV.d6  HSV.d8  HSV.d8 
    #Levels: control HSV.d2 HSV.d4 HSV.d6 HSV.d8
    
    #CONSOLE: mkdir /home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/results_chrHsv1_downstream/star_salmon/degenes
    setwd("/home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/results_chrHsv1_downstream/star_salmon/degenes")
    #---- relevel to control ----
    dds$condition <- relevel(dds$condition, "control")
    dds = DESeq(dds, betaPrior=FALSE)
    resultsNames(dds)
    clist <- c("HSV.d2_vs_control","HSV.d4_vs_control", "HSV.d6_vs_control", "HSV.d8_vs_control")
    
    dds$condition <- relevel(dds$condition, "HSV.d2")
    dds = DESeq(dds, betaPrior=FALSE)
    resultsNames(dds)
    clist <- c("HSV.d4_vs_HSV.d2", "HSV.d6_vs_HSV.d2", "HSV.d8_vs_HSV.d2")
    
    dds$condition <- relevel(dds$condition, "HSV.d4")
    dds = DESeq(dds, betaPrior=FALSE)
    resultsNames(dds)
    clist <- c("HSV.d6_vs_HSV.d4", "HSV.d8_vs_HSV.d4")
    
    dds$condition <- relevel(dds$condition, "HSV.d6")
    dds = DESeq(dds, betaPrior=FALSE)
    resultsNames(dds)
    clist <- c("HSV.d8_vs_HSV.d6")
    
    for (i in clist) {
      contrast = paste("condition", i, sep="_")
      res = results(dds, name=contrast)
      res <- res[!is.na(res$log2FoldChange),]
      res_df <- as.data.frame(res)
    
      write.csv(as.data.frame(res_df[order(res_df$pvalue),]), file = paste(i, "all.txt", sep="-"))
      up <- subset(res_df, padj<=0.05 & log2FoldChange>=1)
      down <- subset(res_df, padj<=0.05 & log2FoldChange<=-1)
      write.csv(as.data.frame(up[order(up$log2FoldChange,decreasing=TRUE),]), file = paste(i, "up.txt", sep="-"))
      write.csv(as.data.frame(down[order(abs(down$log2FoldChange),decreasing=TRUE),]), file = paste(i, "down.txt", sep="-"))
    }
    
      echo "contrast = paste(\"condition\", \"${i}\", sep=\"_\")"
      echo "res = results(dds, name=contrast)"
      #echo "res <- res[!is.na(res$log2FoldChange),]"
      echo "res <- na.omit(res)"
    
    ##https://github.com/kevinblighe/EnhancedVolcano
    #BiocManager::install("EnhancedVolcano")
    library("EnhancedVolcano")
    for i in HSV.d2_vs_control HSV.d4_vs_control HSV.d6_vs_control HSV.d8_vs_control HSV.d4_vs_HSV.d2 HSV.d6_vs_HSV.d2 HSV.d8_vs_HSV.d2 HSV.d6_vs_HSV.d4 HSV.d8_vs_HSV.d4 HSV.d8_vs_HSV.d6; do
    #for i in HSV.d8_vs_control; do
      echo "res <- read.csv(file = paste(\"${i}\", \"all.txt\", sep=\"-\"), row.names = 1)"
      echo "res_df <- as.data.frame(res)"
      echo "png(\"${i}.png\",width=800, height=600)"
      #legendPosition = 'right',legendLabSize = 12,  arrowheads = FALSE,
      #echo "EnhancedVolcano(res, lab = rownames(res),x = 'log2FoldChange',y = 'padj', pCutoff=5e-2, FCcutoff=1.2, title='', subtitleLabSize = 18, pointSize = 3.0, labSize = 5.0, colAlpha=1, legendIconSize = 4.0, drawConnectors = TRUE, widthConnectors = 0.5, colConnectors = 'black', subtitle=expression(~Delta*\"$(echo $i | cut -d'_' -f1) versus \" *~Delta*\"$(echo $i | cut -d'_' -f3)\"))"
      echo "EnhancedVolcano(res, lab = rownames(res),x = 'log2FoldChange',y = 'padj', pCutoff=5e-2, FCcutoff=1.0, title='', subtitleLabSize = 18, pointSize = 3.0, labSize = 5.0, colAlpha=1, legendIconSize = 4.0, drawConnectors = TRUE, widthConnectors = 0.5, colConnectors = 'black', subtitle=expression(\"$(echo $i | cut -d'_' -f1) versus $(echo $i | cut -d'_' -f3)\"))"
      echo "dev.off()"
    done
    
    #DEBUG: why some genes in HSV.d8 in control high regulated --> ERROR! We should keep the number of reads in the raw counts, leading all genes low regulated! Not using the default normalization method!!!!
    #res <- read.csv(file = paste("HSV.d8_vs_control", "all.txt", sep="-"), row.names = 1)
    #res_df <- as.data.frame(res)
    #png("HSV.d8_vs_control.png",width=800, height=600)
    #EnhancedVolcano(res, lab = rownames(res),x = 'log2FoldChange',y = 'padj', pCutoff=5e-2, FCcutoff=1.0, title='', subtitleLabSize = 18, pointSize = 3.0, labSize = 5.0, colAlpha=1, legendIconSize = 4.0, drawConnectors = TRUE, widthConnectors = 0.5, colConnectors = 'black', subtitle=expression("HSV.d8 versus control"))
    #dev.off()
    
    #under DIR degenes under KONSOLE
    for i in HSV.d2_vs_control HSV.d4_vs_control HSV.d6_vs_control HSV.d8_vs_control HSV.d4_vs_HSV.d2 HSV.d6_vs_HSV.d2 HSV.d8_vs_HSV.d2 HSV.d6_vs_HSV.d4 HSV.d8_vs_HSV.d4 HSV.d8_vs_HSV.d6; do echo "~/Tools/csv2xls-0.4/csv_to_xls.py ${i}-all.txt ${i}-up.txt ${i}-down.txt -d$',' -o ${i}.xls;"; done
    
  6. clustering the genes and draw heatmap

    for i in HSV.d2_vs_control HSV.d4_vs_control HSV.d6_vs_control HSV.d8_vs_control HSV.d4_vs_HSV.d2 HSV.d6_vs_HSV.d2 HSV.d8_vs_HSV.d2 HSV.d6_vs_HSV.d4 HSV.d8_vs_HSV.d4 HSV.d8_vs_HSV.d6; do echo "cut -d',' -f1-1 ${i}-up.txt > ${i}-up.id"; echo "cut -d',' -f1-1 ${i}-down.txt > ${i}-down.id"; done
    
    cat *.id | sort -u > ids
    #add Gene_Id in the first line, delete the ""
    GOI <- read.csv("ids")$Gene_Id
    RNASeq.NoCellLine <- assay(rld)
    
    #install.packages("gplots")
    library("gplots")
    
    #clustering methods: "ward.D", "ward.D2", "single", "complete", "average" (= UPGMA), "mcquitty" (= WPGMA), "median" (= WPGMC) or "centroid" (= UPGMC).  pearson or spearman
    #datamat = RNASeq.NoCellLine[GOI, ]
    datamat = RNASeq.NoCellLine
    write.csv(as.data.frame(datamat), file ="gene_expressions.txt")
    constant_rows <- apply(datamat, 1, function(row) var(row) == 0)
    if(any(constant_rows)) {
      cat("Removing", sum(constant_rows), "constant rows.\n")
      datamat <- datamat[!constant_rows, ]
    }
    hr <- hclust(as.dist(1-cor(t(datamat), method="pearson")), method="complete")
    hc <- hclust(as.dist(1-cor(datamat, method="spearman")), method="complete")
    mycl = cutree(hr, h=max(hr$height)/1.5)
    mycol = c("YELLOW", "BLUE", "ORANGE", "MAGENTA", "CYAN", "RED", "GREEN", "MAROON", "LIGHTBLUE", "PINK", "MAGENTA", "LIGHTCYAN", "LIGHTRED", "LIGHTGREEN");
    mycol = mycol[as.vector(mycl)]
    
    #png("DEGs_heatmap.png", width=900, height=800)
    #cex.lab=10, labRow="",
    png("DEGs_heatmap.png", width=900, height=1000)
    heatmap.2(as.matrix(datamat),Rowv=as.dendrogram(hr),Colv = NA, dendrogram = 'row',
                scale='row',trace='none',col=bluered(75), 
                RowSideColors = mycol, margins=c(10,20), cexRow=1.5, srtCol=45, lhei = c(2, 8))  #rownames(datamat)  
    #heatmap.2(datamat, Rowv=as.dendrogram(hr), col=bluered(75), scale="row", RowSideColors=mycol, trace="none", margin=c(5,5), sepwidth=c(0,0), dendrogram = 'row', Colv = 'false', density.info='none', labRow="", srtCol=30, lhei=c(0.1,2))
    dev.off()
    
    #### cluster members #####
    write.csv(names(subset(mycl, mycl == '1')),file='cluster1_YELLOW.txt')
    write.csv(names(subset(mycl, mycl == '2')),file='cluster2_DARKBLUE.txt') 
    write.csv(names(subset(mycl, mycl == '3')),file='cluster3_DARKORANGE.txt')  
    #~/Tools/csv2xls-0.4/csv_to_xls.py cluster*.txt -d',' -o DEGs_heatmap_cluster_members.xls
    
    ~/Tools/csv2xls-0.4/csv_to_xls.py \
    significant_gene_expressions.txt \
    -d',' -o DEGs_heatmap_expression_data.xls;
    

like unlike

点赞本文的读者

还没有人对此文章表态


本文有评论

没有评论

看文章,发评论,不要沉默


© 2023 XGenes.com Impressum