RNA-Seq Analysis of the WaGa samples
1 Libraries
Let’s start by loading all of the libraries that will be used throughout this analysis.
# Load required libraries
library('Biobase')
library('biomaRt')
library('coop')
library('doParallel')
library('flashClust')
library('foreach')
library('ggplot2')
library('goseq')
#library('heatmap.plus')
library('GO.db')
library('gplots')
library('knitcitations')
library('limma')
library('Matrix')
library('plyr')
library('dplyr')
library('readr')
library('tools')
library('rtracklayer')
library('parallelDist')
library('preprocessCore')
library('printr')
library('reshape2')
library('RColorBrewer')
library('RCurl')
library('sva')
library('viridis')
#library('annotables')
library('hpgltools')
library("AnnotationDbi")
library("clusterProfiler")
library("ReactomePA")
#library("org.Mm.eg.db")
library("DESeq2")
library("gplots")
# Load helper functions
source('../../R/annotations.R')
source('../../R/count_tables.R')
source('../../R/differential_expression.R')
source('../../R/enrichment_analysis.R')
source('../../R/filtering.R')
source('../../R/pca.R')
source('../../R/plots.R')
source('../../R/util.R')
# Output format-specific options
if (opts_knit$get("rmarkdown.pandoc.to") == 'html') {
# Interaction datatable used for HTML output
library('DT')
# Side-by-side plots (HTML)
combined_plot_width = '400px'
} else {
# Side-by-side plots (PDF)
combined_plot_width = '3.5in'
}
2 Load RNA-Seq counts
2.1 Samples
# Samples included in this analysis
if (CONFIG$include_tables) {
kable(head(CONFIG$samples[, 1:min(6, ncol(CONFIG$samples))], 50),
caption='RNA-Seq samples.')
}
sample_id | condition | replicate | batch |
---|---|---|---|
1107_WaGa_sT_DMSO_EVAligned.sortedByCoord.out.bam | sT.DMSO.EV | rep1 | batch2 |
1605_WaGa_sT_DMSO_EVAligned.sortedByCoord.out.bam | sT.DMSO.EV | rep2 | batch2 |
2706_WaGa_sT_DMSO_EVAligned.sortedByCoord.out.bam | sT.DMSO.EV | rep3 | batch2 |
1107_WaGa_scr_DMSO_EVAligned.sortedByCoord.out.bam | scr.DMSO.EV | rep1 | batch2 |
1605_WaGa_scr_DMSO_EVAligned.sortedByCoord.out.bam | scr.DMSO.EV | rep2 | batch2 |
2706_WaGa_scr_DMSO_EVAligned.sortedByCoord.out.bam | scr.DMSO.EV | rep3 | batch2 |
1107_WaGa_sT_Dox_EVAligned.sortedByCoord.out.bam | sT.Dox.EV | rep1 | batch2 |
1605_WaGa_sT_Dox_EVAligned.sortedByCoord.out.bam | sT.Dox.EV | rep2 | batch2 |
2706_WaGa_sT_Dox_EVAligned.sortedByCoord.out.bam | sT.Dox.EV | rep3 | batch2 |
1107_WaGa_scr_Dox_EVAligned.sortedByCoord.out.bam | scr.Dox.EV | rep1 | batch2 |
1605_WaGa_scr_Dox_EVAligned.sortedByCoord.out.bam | scr.Dox.EV | rep2 | batch2 |
2706_WaGa_scr_Dox_EVAligned.sortedByCoord.out.bam | scr.Dox.EV | rep3 | batch2 |
1107_WaGa_wt_EVAligned.sortedByCoord.out.bam | wt.EV | rep1 | batch2 |
1605_WaGa_wt_EVAligned.sortedByCoord.out.bam | wt.EV | rep2 | batch2 |
2706_WaGa_wt_EVAligned.sortedByCoord.out.bam | wt.EV | rep3 | batch2 |
WaGa_RNAAligned.sortedByCoord.out.bam | RNA | rep1 | batch1 |
WaGa_RNA_118Aligned.sortedByCoord.out.bam | RNA | rep2 | batch1 |
WaGa_RNA_147Aligned.sortedByCoord.out.bam | RNA | rep3 | batch1 |
2.2 Load sample information
To make things easier, all of the relevant experiment sample information including sample ID, condition, and batch have been stored in separate CSV files.
During the processing of the analysis configuration above, the relevant sample metadata was loaded. Here we create a few variables containing different subsets of this information that will be useful throughout the downstream analyses.
During the differential expression analysis, we will use a larger subset of the samples in order to improve our estimation of variance and batch effects.
# Sample metadata
sample_ids <- as.character(CONFIG$samples[[CONFIG$sample_id]])
condition <- factor(CONFIG$samples[[CONFIG$condition]])
replicate <- factor(CONFIG$samples[[CONFIG$replicate]])
batch <- factor(CONFIG$samples[[CONFIG$batch]])
# Covariate dataframe
covariates <- CONFIG$samples %>% select(!!CONFIG$covariates)
# Make sure levels for batch and condition are valid
#levels(condition) <- make.names(levels(condition))
#levels(batch) <- make.names(levels(batch))
# Design matrix; this may be updated later if batch adjustment is enabled
design_condition_only <- model.matrix(~0+condition)
design_cond_only <- model.matrix(~condition)
if (length(levels(batch)) > 0) {
design_including_batch <- model.matrix(~0+condition+batch)
design_incl_batch <- model.matrix(~condition+batch)
}
2.3 Load count tables
Next, we will load in the count data for each sample. Throughout our analyses we will apply several different types of filters and data transformations. Since the exact types of normalization and filtering procedures we want to apply may differ, depending on the type of analysis we are interested in (e.g. differential expression vs. co-expression network analysis), for each type of analysis, we will maintain a list with the count tables at each of the major steps in processing. Whenever genes or samples are removed from one version of the count table for that analysis, the same genes and samples will be removed from all other versions of the count table. This way we can ensure that the dimensions of the count matrix is consistent across all levels of processing, within a given analysis.
The different stages of processing which will be maintained are:
- raw: Unmodified count values as generated by featureCounts, or another quantification tool.
- rld: rlog-transformed counts.
- normed: Counts with zero or more of the following transformations applied: log2, CPM, voom, quantile normalization.
- batch_adjusted: Same as above, but with an (optional) batch adjustment step. If "batch_adjust" is set to "none", this will be the same as the normed version of the counts.
- final: The final version of the counts after all filtering and transformation steps have been applied.
## Preparing count matrix
# Use pre-loaded count table
#if (is.data.frame(CONFIG$input_count_tables)) {
# count_table <- CONFIG$input_count_tables
#} else if (length(CONFIG$input_count_tables) == 1 && tolower(file_ext(CONFIG$input_count_tables)) %in% c('rda', 'rdata')) {
# # Load from RData
# print("Load from RData")
# # Check to see if count table has been loaded, and if not, load
# if (!CONFIG$input_count_var %in% ls()) {
# load(CONFIG$input_count_tables)
# }
#
# # assign to variable "dat"
# dat <- get(CONFIG$input_count_var)
#
# # ExpressionSet
# if (class(dat) == 'ExpressionSet') {
# count_table <- exprs(dat)
# attributes(count_table)$names <- NULL
# } else if (class(dat) == 'RangedSummarizedExperiment') {
# # RangedSummarizedExperiment
# count_table <- SummarizedExperiment::assays(dat)$counts
# }
#} else {
# Find all matching count files
count_files <- Sys.glob(CONFIG$input_count_tables)
# Single combined count file
message("Single combined count file")
## Single combined count file
if (length(count_files) == 1) {
# Load combined count table
# TODO: Generalize so that counts stored in arbitrary formats can be loaded
count_table <- read.table(CONFIG$input_count_tables, header=TRUE,
row.names=1, check.names=FALSE)
} else {
# iterate over per-sample counts and append to count matrix
count_table <- read.delim(count_files[1], header = FALSE, row.names = 1)
COUNT_COLUMN_IDX <- 2
for (infile in count_files[2:length(count_files)]) {
sample_counts <- read.delim(infile, header = FALSE)
count_table <- cbind(count_table, sample_counts[, COUNT_COLUMN_IDX])
}
# fix column names
colnames(count_table) <- file_path_sans_ext(basename(count_files))
count_table <- as.matrix(count_table)
}
# Make sure featureCounts meta-columns are removed (should be taken care of in
# recent versions of HPGLtools)
count_table <- count_table[!grepl('^X?__', rownames(count_table)),]
#}
# Remove ENSEMBL gene id version, if present
# e.g. "ENSG00000005421.8" -> "ENSG00000005421"
if (startsWith(rownames(count_table)[1], 'ENS')) {
rownames(count_table) <- sub('\\.[0-9]*', '', rownames(count_table))
}
# Exclude any missing samples
#sample_ids <- sample_ids[sample_ids %in% colnames(count_table)]
# Filter out samples not needed for this analysis
count_table <- count_table[, sample_ids]
# Drop any rows with NA's (should only occur when comparing counts across
# experiments where different reference GFF's were used)
count_table <- count_table[complete.cases(count_table),]
# print initial size of raw count table
if (CONFIG$verbose) {
cat(sprintf("**Count table dimensions:**\n"))
cat(sprintf("- Rows: %d\n", nrow(count_table)))
cat(sprintf("- Columns: %d\n", ncol(count_table)))
}
**Count table dimensions:**
- Rows: 60664
- Columns: 18
## [1] 76153959
3 Load annotations
3.1 Load gene annotations
Next, we will use the Bioconductor OrganismDbi package (https://www.bioconductor.org/packages/devel/bioc/manuals/OrganismDbi/man/OrganismDbi.pdf) and the R interface to BioMart (doi:10.1186/1471-2164-10-22; doi:10.1038/nprot.2009.97) to retrieve ENSEMBL gene annotations for the host.
# Load gene and transcript annotations
library(CONFIG$organism_db, character.only=TRUE)
#OrganismDb Object
orgdb <- get(CONFIG$organism_db)
# BUG (Must be fixed!) Fix AnnotationDbi namespace mess
assign('select', dplyr::select, envir=.GlobalEnv)
assign('get', base::get, envir=.GlobalEnv)
# main ensembl host is down fairly often, so alt hosts may be needed.
gene_info <- load_host_annotations(orgdb, rownames(count_table),
keytype=CONFIG$orgdb_key,
biomart_dataset=CONFIG$biomart_dataset,
biomart_host='www.ensembl.org')
#biomart_host='useast.ensembl.org')
#biomart_host='aug2017.archive.ensembl.org')
#biomart_host='archive.ensembl.org')
#https://bioconductor.org/packages/devel/bioc/vignettes/AnnotationDbi/inst/doc/IntroToAnnotationPackages.pdf
# Keep only the feature information for the remaining genes
gene_info <- gene_info[gene_info$gene_id %in% rownames(count_table),]
# For now, just grab the description for the first transcript
gene_info <- gene_info[!duplicated(gene_info$gene_id), ]
# Gene IDs
gene_ids <- rownames(count_table)
# Determine approximate transcript lengths for each gene.
# Note that because the annotations have not been filtered to remove ncRNAs,
# many of the genes associated with lncRNAs, etc. will not have a length
# associated with them.
gene_info$transcript_length <- getlength(gene_ids, CONFIG$organism_genome, "ensGene")
# -- DEBUG --
#Can't find hg38/ensGene length data in genLenDataBase...
#A TxDb annotation package exists for hg38. Consider installing it to get the gene lengths.
#Trying to download from UCSC. This might take a couple of minutes.
#if (!require("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
#BiocManager::install("BSgenome.Hsapiens.UCSC.hg38")
#BiocManager::install("TxDb.Hsapiens.UCSC.hg38.knownGene")
#library("BSgenome.Hsapiens.UCSC.hg38")
#library("TxDb.Hsapiens.UCSC.hg38.knownGene")
#?supportedGenomes()
# Note 2017/12/09
# ---------------
# hg19 still better to use for tx lengths than hg38 for the moment:
# > sum(is.na(getlength(gene_ids, 'hg38', "ensGene")))
# [1] 36591
# > sum(is.na(getlength(gene_ids, 'hg19', "ensGene")))
# [1] 24462
# Note 2023/02/08
# ---------------
#Loading hg19 length data...
#[1] 25670
# Location of external annotation files
species_dir <- tolower(sub('. ', '', CONFIG$host))
input_dir <- file.path('..', '..', 'data', species_dir)
# gene annotations
if (CONFIG$include_tables) {
kable(head(gene_info), caption='Preview of gene annotations.')
}
gene_id | chromosome | description | strand | type | transcript_length |
---|---|---|---|---|---|
ENSG00000284662 | chr5 | olfactory receptor family 4 subfamily F member 16 | + | protein_coding | 629 |
ENSG00000186827 | chr1 | TNF receptor superfamily member 4 | - | protein_coding | 1075 |
ENSG00000186891 | chr1 | TNF receptor superfamily member 18 | - | protein_coding | 789 |
ENSG00000160072 | chr1 | ATPase family AAA domain containing 3B | + | protein_coding | 2114 |
ENSG00000041988 | chr1 | THAP domain containing 3 | + | protein_coding | 931 |
ENSG00000260179 | NA | NA | NA | lncRNA | NA |
3.2 Load GO annotations
Next, we will use the GO.db annotation package to load a table of Gene Ontology (GO; doi:10.1038/75556) terms associated with each host gene.
# Load GO annotations (for host species, exclude ancestor terms to speed things
# up; goseq will handle this for us later on)
go_terms <- load_go_terms(orgdb, rownames(count_table), keytype=CONFIG$orgdb_key,
include_ancestors=FALSE)
# Gene / GO term mapping
gene_go_mapping <- as.data.frame(unique(
go_terms %>% select(.data[[CONFIG$orgdb_key]], GO, ONTOLOGY)
))
colnames(gene_go_mapping) <- c('gene', 'category', 'ontology')
go_term_id_mapping <- as.data.frame(unique(go_terms[c('GO', 'TERM', 'ONTOLOGY')]))
colnames(go_term_id_mapping) <- c("category", "term", "ontology")
# Preview of GO term annotations
if (CONFIG$include_tables) {
kable(head(go_terms), caption='Preview of GO annotations.')
}
GO | ENSEMBL | TERM | ONTOLOGY |
---|---|---|---|
GO:0000002 | ENSG00000114120 | mitochondrial genome maintenance | BP |
GO:0000002 | ENSG00000025708 | mitochondrial genome maintenance | BP |
GO:0000002 | ENSG00000068305 | mitochondrial genome maintenance | BP |
GO:0000002 | ENSG00000117020 | mitochondrial genome maintenance | BP |
GO:0000002 | ENSG00000171612 | mitochondrial genome maintenance | BP |
GO:0000002 | ENSG00000115204 | mitochondrial genome maintenance | BP |
3.3 Load KEGG annotations
Next, we will load a table of KEGG (doi:10.1093/nar/27.1.29) pathway annotations for each host gene.
# For mouse/human, KEGG mapping has to be loaded separately
# Human
if (CONFIG$host == 'H. sapiens') {
kegg_mapping_file <- file.path(input_dir, 'Hsapiens_KEGG_Annotations.csv')
kegg_pathways_file <- file.path(input_dir, 'Hsapiens_KEGG_Pathways.csv')
org_abbreviation <- 'hsa'
} else if (CONFIG$host == 'M. musculus') {
# Mouse
kegg_mapping_file <- file.path(input_dir, 'Mmusculus_KEGG_Annotations.csv')
kegg_pathways_file <- file.path(input_dir, 'Mmusculus_KEGG_Pathways.csv')
org_abbreviation <- 'mmu'
}
if (file.exists(kegg_mapping_file)) {
# If KEGG mapping are available, load from file
gene_kegg_mapping <- read.csv(kegg_mapping_file)
kegg_pathways <- read.delim(kegg_pathways_file)
} else {
# Otherwise use KEGGREST to construct mappings
library('KEGGREST')
pathways <- unique(keggLink("pathway", org_abbreviation))
kegg_pathways <- generate_kegg_pathway_mapping(pathways, CONFIG$verbose)
gene_kegg_mapping <- generate_gene_kegg_mapping(pathways,
org_abbreviation,
verbose=CONFIG$verbose)
# Save KEGG mapping
if(!file.exists(input_dir)) {
dir.create(input_dir, recursive=TRUE)
}
write.csv(gene_kegg_mapping, file=kegg_mapping_file, quote=FALSE,
row.names=FALSE)
write.table(kegg_pathways, file=kegg_pathways_file, quote=FALSE,
row.names=FALSE, sep='\t')
}
# Rename gene/KEGG mapping columns to be consistent with GO mapping
colnames(gene_kegg_mapping) <- c('gene', 'category')
colnames(kegg_pathways) <- c('category', 'name', 'class', 'description')
kegg_pathways <- unique(kegg_pathways)
# Preview of KEGG pathway annotations
if (CONFIG$include_tables) {
kable(head(kegg_pathways), caption='Preview of KEGG annotations.')
}
category | name | class | description |
---|---|---|---|
path:hsa00232 | Caffeine metabolism | Metabolism; Biosynthesis of other secondary metabolites | |
path:hsa00983 | Drug metabolism - other enzymes | Metabolism; Xenobiotics biodegradation and metabolism | |
path:hsa01100 | Metabolic pathways | ||
path:hsa05204 | Chemical carcinogenesis | Human Diseases; Cancers | It has been estimated that exposure to environmental chemical carcinogens may contribute significantly to the causation of a sizable fraction, perhaps a majority, of human cancers. Human carcinogens act through a variety of genotoxic and non-genotoxic mechanisms. Genotoxic carcinogens can attack biological macromolecules such as DNA and RNA either directly or indirectly through metabolism, resulting in the formation of adducts with these macromolecules. If DNA adducts escape cellular repair mechanisms and persist, they may lead to miscoding, resulting in permanent mutations. Non-genotoxic carcinogens act by the mechanisms such as induction of inflammation, immunosuppression, formation of reactive oxygen species, activation of receptors, and epigenetic silencing. Together, these genotoxic and non-genotoxic mechanisms can alter signal-transduction pathways that finally result in hypermutability, genomic instability, loss of proliferation control, and resistance to apoptosis - some of the characteristic features of cancer cells. |
path:hsa00230 | Purine metabolism | Metabolism; Nucleotide metabolism | |
path:hsa05340 | Primary immunodeficiency | Human Diseases; Immune diseases | Primary immunodeficiencies (PIs) are a heterogeneous group of disorders, which affect cellular and humoral immunity or non-specific host defense mechanisms mediated by complement proteins, and cells such as phagocytes and natural killer (NK) cells. These disorders of the immune system cause increased susceptibility to infection, autoimmune disease, and malignancy. Most of PIs are due to genetic defects that affect cell maturation or function at different levels during hematopoiesis. Disruption of the cellular immunity is observed in patients with defects in T cells or both T and B cells. These cellular immunodeficiencies comprise 20% of all PIs. Disorders of humoral immunity affect B-cell differentiation and antibody production. They account for 70% of all PIs. |
4 Create ExpressionSet
Rather than keeping track of the count data and meta information all separately, let’s create a Bioconductor ExpressionSet instance containing all of the relevant information.
In order to create an ExpressionSet, we first have to make sure each of the three main components (expression data, sample information, and feature information) have consistent row and column names and have been sorted in the same order.
Some useful methods for working with ExpressionSets:
- sampleNames(expr_set): sample ids
- featureNames(expr_set): gene ids
- exprs(expr_set): counts
- pData(expr_set): sample metadata (id, condition, batch, etc.)
- fData(expr_set): gene annotations
# Assay data (RNA-Seq count table)
assay_data <- as.matrix(count_table)
# Sort rows by gene id
assay_data <- assay_data[order(row.names(assay_data)),]
# Pheno data (sample information)
pheno_data <- new("AnnotatedDataFrame",
data.frame(sample=sample_ids, condition=condition, batch=batch))
sampleNames(pheno_data) <- colnames(assay_data)
# Feature data (gene annotations)
gene_info <- gene_info[order(gene_info$gene_id),]
feature_data <- new("AnnotatedDataFrame", as.data.frame(gene_info))
#featureNames(feature_data) <- rownames(assay_data)
featureNames(feature_data) <- gene_info$gene_id
# List to store counts used for differential expression analysis
de_counts = list()
de_counts$raw <- new("ExpressionSet", exprs=assay_data,
phenoData=pheno_data, featureData=feature_data)
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 60664 features, 18 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: 1107_WaGa_sT_DMSO_EVAligned.sortedByCoord.out.bam
## 1605_WaGa_sT_DMSO_EVAligned.sortedByCoord.out.bam ...
## WaGa_RNA_147Aligned.sortedByCoord.out.bam (18 total)
## varLabels: sample condition batch
## varMetadata: labelDescription
## featureData
## featureNames: ENSG00000000003 ENSG00000000005 ... ENSG00000288725
## (60664 total)
## fvarLabels: gene_id chromosome ... transcript_length (6 total)
## fvarMetadata: labelDescription
## experimentData: use 'experimentData(object)'
## Annotation:
5 Preview raw counts
5.1 Total number of reads
num_reads_total <- sum(as.numeric(exprs(de_counts$raw)))
cat(sprintf('\n**Total number of reads: %0.0f**\n\n', num_reads_total))
Total number of reads: 76153959
5.2 Highly expressed genes (before filtering)
gene_total_counts <- data.frame(sort(rowSums(exprs(de_counts$raw)), decreasing=TRUE))
colnames(gene_total_counts) <- c("total_reads")
# preview structure of gene annotations
gene_info_subset <- gene_info[match(rownames(gene_total_counts), gene_info$gene_id),]
gene_info_subset$num_reads <- gene_total_counts$total_reads
gene_info_subset$num_reads_pct <- (gene_total_counts$total_reads / num_reads_total) * 100
gene_info_subset <- gene_info_subset %>%
dplyr::select(-chromosome, -strand) %>%
dplyr::rename(tx_len=transcript_length)
if (CONFIG$include_tables) {
xkable(head(gene_info_subset, 10),
caption='Genes with the highest average expression before filtering.',
str_max_width=20)
}
gene_id | description | type | tx_len | num_reads | num_reads_pct |
---|---|---|---|---|---|
ENSG00000198804 | cytochrome c oxid… | protein_coding | NA | 1304472 | 1.7129 |
ENSG00000198886 | NADH dehydrogenas… | protein_coding | NA | 1189403 | 1.5618 |
ENSG00000198712 | cytochrome c oxid… | protein_coding | NA | 789210 | 1.0363 |
ENSG00000198727 | cytochrome b | protein_coding | NA | 716157 | 0.9404 |
ENSG00000140259 | microfibril assoc… | protein_coding | 1441 | 708146 | 0.9299 |
ENSG00000198786 | NADH dehydrogenas… | protein_coding | NA | 658007 | 0.8640 |
ENSG00000198938 | cytochrome c oxid… | protein_coding | NA | 630656 | 0.8281 |
ENSG00000198888 | NADH dehydrogenas… | protein_coding | NA | 486080 | 0.6383 |
ENSG00000251562 | metastasis associ… | lncRNA | 2227 | 381826 | 0.5014 |
ENSG00000210082 | NA | Mt_rRNA | NA | 378582 | 0.4971 |
## [1] 76153959
6 Filter data
Next let’s filter the data to remove unwanted or uninformative genes.
Filtering targets:
- low-count genes
- noncoding RNAs
Later on (post-normalization), an additional variance-based filtering step may also be applied.
6.1 Remove low-count genes
# for relatively small datasets, we can look for genes with non-zero expression in at least
# as many samples as are contained in the smallest condition group
reps_per_batch <- table(factor(condition))
min_replicates <- min(reps_per_batch)
# in some cases, very few replicates (possibly only a single one) are present;
# we will use a lower limit of 3 samples to ensure that at least a few of the samples
# pass the specific low count threshold
min_samples_passing_threshold <- max(min_replicates, 3)
# for larger datasets with many conditions (e.g. > 1,000 samples), this may be too conservative,
# so we also set a lower limit equal to 5% of the total number of samples
min_samples_passing_threshold <- max(min_samples_passing_threshold, round(length(condition) / 20))
de_counts$raw <- filter_lowcount_genes(de_counts$raw,
threshold=CONFIG$low_count_threshold,
min_samples=min_samples_passing_threshold, verbose=TRUE)
## [1] "Removing 6869 low-count genes (53795 remaining)."
## [1] 76133340
6.2 Filter noncoding RNAs
Next, we will remove ncRNAs such as rRNAs, tRNAs, and snoRNAs, which are not regulated in the same manner as protein-coding genes. Since these are not polyadenylated, they should not be present in the samples in large numbers.
# Filter by gene id or type
# Note that when filtering by type for T. cruzi, SRPs and SLRNAs are not included
ncrna_mask <- rep(FALSE, nrow(de_counts$raw))
if (!is.null(CONFIG$id_filter_string)) {
ncrna_mask <- ncrna_mask | grepl(CONFIG$id_filter_string,
rownames(de_counts$raw))
}
if (!is.null(CONFIG$type_filter_string)) {
gene_types <- (gene_info %>% filter(gene_id %in% rownames(de_counts$raw)))$type
ncrna_mask <- ncrna_mask | grepl(CONFIG$type_filter_string, gene_types)
}
num_noncoding_rnas <- sum(ncrna_mask)
print(sprintf("Removing %d non-coding RNAs", num_noncoding_rnas))
## [1] "Removing 0 non-coding RNAs"
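The subsetting step itself is not echoed in the report; a minimal sketch of how the mask would be applied to the ExpressionSet (a no-op here, since no ncRNAs matched):
# drop the flagged ncRNA genes from the DE count set
de_counts$raw <- de_counts$raw[!ncrna_mask, ]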
6.3 Highly expressed genes (after filtering)
num_reads_total <- sum(as.numeric(exprs(de_counts$raw)))
gene_total_counts <- data.frame(sort(rowSums(exprs(de_counts$raw)), decreasing=TRUE))
colnames(gene_total_counts) <- c("total_reads")
gene_info_subset <- gene_info[match(rownames(gene_total_counts),
gene_info$gene_id),]
gene_info_subset$num_reads <- gene_total_counts$total_reads
gene_info_subset$num_reads_pct <- (gene_total_counts$total_reads / num_reads_total) * 100
gene_info_subset <- gene_info_subset %>%
dplyr::select(-chromosome, -strand) %>%
dplyr::rename(tx_len=transcript_length)
if (CONFIG$include_tables) {
xkable(head(gene_info_subset, 10),
caption='Genes with the highest average expression after filtering.',
str_max_width=20)
}
gene_id | description | type | tx_len | num_reads | num_reads_pct |
---|---|---|---|---|---|
ENSG00000198804 | cytochrome c oxid… | protein_coding | NA | 1304472 | 1.7134 |
ENSG00000198886 | NADH dehydrogenas… | protein_coding | NA | 1189403 | 1.5623 |
ENSG00000198712 | cytochrome c oxid… | protein_coding | NA | 789210 | 1.0366 |
ENSG00000198727 | cytochrome b | protein_coding | NA | 716157 | 0.9407 |
ENSG00000140259 | microfibril assoc… | protein_coding | 1441 | 708146 | 0.9301 |
ENSG00000198786 | NADH dehydrogenas… | protein_coding | NA | 658007 | 0.8643 |
ENSG00000198938 | cytochrome c oxid… | protein_coding | NA | 630656 | 0.8284 |
ENSG00000198888 | NADH dehydrogenas… | protein_coding | NA | 486080 | 0.6385 |
ENSG00000251562 | metastasis associ… | lncRNA | 2227 | 381826 | 0.5015 |
ENSG00000210082 | NA | Mt_rRNA | NA | 378582 | 0.4973 |
## Features Samples
## 53795 18
## [1] 76133340
7 Create DESeqDataSet
To analyze and graph the data using the DESeq2 package, we have to change the data format from an ExpressionSet to a DESeqDataSet object and identify the factor in the experimental design that we want to compare for our differential expression analysis. This requires a few steps. (https://colauttilab.github.io/RNA-Seq_Tutorial.html#deseq2_analysis)
First, we extract the raw count matrix from the ExpressionSet.
Second, we prepare a data frame containing metadata about each sample.
Third, we construct a DESeqDataSet object from the prepared data (sketched below).
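The chunk that builds the DESeqDataSet is not echoed in the report; a minimal sketch, assuming the object names (cData, dds) used in later chunks:
# 1. raw count matrix from the ExpressionSet
count_mat <- exprs(de_counts$raw)
# 2. sample metadata, one row per column of the count matrix
cData <- data.frame(sample = sample_ids, condition = condition, batch = batch,
                    row.names = colnames(count_mat))
# 3. DESeqDataSet with a condition-only design
dds <- DESeqDataSetFromMatrix(countData = count_mat, colData = cData, design = ~condition)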
8 Estimate size factors
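The size-factor estimation itself is not echoed; a minimal sketch using DESeq2's median-of-ratios method on the dds object created above (the rounded values are shown below):
dds <- estimateSizeFactors(dds)
round(sizeFactors(dds), 4)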
## 1107_WaGa_sT_DMSO_EVAligned.sortedByCoord.out.bam
## 0.8411
## 1605_WaGa_sT_DMSO_EVAligned.sortedByCoord.out.bam
## 0.9430
## 2706_WaGa_sT_DMSO_EVAligned.sortedByCoord.out.bam
## 1.0928
## 1107_WaGa_scr_DMSO_EVAligned.sortedByCoord.out.bam
## 0.8531
## 1605_WaGa_scr_DMSO_EVAligned.sortedByCoord.out.bam
## 0.8503
## 2706_WaGa_scr_DMSO_EVAligned.sortedByCoord.out.bam
## 1.1101
# raw and normalized counts
raw_counts <- counts(dds)
normalized_counts <- counts(dds, normalized=TRUE)
write.table(raw_counts, file="tables/raw_counts.txt", sep="\t", quote=F, col.names=NA)
write.table(normalized_counts, file="tables/normalized_counts.txt", sep="\t", quote=F, col.names=NA)
# prepare the function estimSf
### Let's implement such a function to verify DESeq2's size factors
### cds is a DESeqDataSet (count dataset)
estimSf <- function (cds){
# Get the count matrix
cts <- counts(cds)
# Compute the geometric mean of a vector
geomMean <- function(x) prod(x)^(1/length(x))
# Compute the geometric mean of each row (gene) across samples
gm.mean <- apply(cts, 1, geomMean)
# Set zero values to NA (avoids subsequent division by 0)
gm.mean[gm.mean == 0] <- NA
# Divide each row by its corresponding geometric mean
# sweep(x, MARGIN, STATS, FUN = "-", check.margin = TRUE, ...)
# MARGIN: 1 or 2 (rows or columns)
# STATS: a vector of length nrow(x) or ncol(x), depending on MARGIN
# FUN: the function to be applied
cts <- sweep(cts, 1, gm.mean, FUN="/")
# The size factor of each sample is the median of these ratios over genes
med <- apply(cts, 2, median, na.rm=TRUE)
# Return the scaling factors
return(med)
}
#https://dputhier.github.io/ASG/practicals/rnaseq_diff_Snf2/rnaseq_diff_Snf2.html
#http://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#data-transformations-and-visualization
#https://hbctraining.github.io/DGE_workshop/lessons/02_DGE_count_normalization.html
#https://hbctraining.github.io/DGE_workshop/lessons/04_DGE_DESeq2_analysis.html
#https://genviz.org/module-04-expression/0004/02/01/DifferentialExpression/
#DESeq2’s median of ratios [1]
#EdgeR’s trimmed mean of M values (TMM) [2]
#http://www.nathalievialaneix.eu/doc/html/TP1_normalization.html
test_normcount <- sweep(raw_counts, 2, sizeFactors(dds), "/")
sum(test_normcount != normalized_counts)
## [1] 0
## 1107_WaGa_sT_DMSO_EVAligned.sortedByCoord.out.bam
## 0.8411
## 1605_WaGa_sT_DMSO_EVAligned.sortedByCoord.out.bam
## 0.9430
## 2706_WaGa_sT_DMSO_EVAligned.sortedByCoord.out.bam
## 1.0928
## 1107_WaGa_scr_DMSO_EVAligned.sortedByCoord.out.bam
## 0.8531
## 1605_WaGa_scr_DMSO_EVAligned.sortedByCoord.out.bam
## 0.8503
## 2706_WaGa_scr_DMSO_EVAligned.sortedByCoord.out.bam
## 1.1101
## [1] TRUE
9 Basic exploratory analysis
# experimental design
design <- cData
#design
#design_cond_only
#design_condition_only
#design_incl_batch
#design_including_batch
9.1 Raw counts and Pseudo counts
raw_counts_wn <- raw_counts[rowSums(raw_counts) > 0, ] # counts were already filtered above, so this should remove nothing
dim(raw_counts_wn)
## [1] 53795 18
To visualize the count distribution, it is often useful to compute "pseudo counts", i.e. log2-transformed counts of the form log2(count + 1):
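The chunk computing the pseudo counts is hidden in the rendered output; a minimal sketch that reproduces the preview table below:
pseudo_counts <- log2(raw_counts_wn + 1)
head(pseudo_counts)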
1107_WaGa_sT_DMSO_EVAligned.sortedByCoord.out.bam | 1605_WaGa_sT_DMSO_EVAligned.sortedByCoord.out.bam | 2706_WaGa_sT_DMSO_EVAligned.sortedByCoord.out.bam | 1107_WaGa_scr_DMSO_EVAligned.sortedByCoord.out.bam | 1605_WaGa_scr_DMSO_EVAligned.sortedByCoord.out.bam | 2706_WaGa_scr_DMSO_EVAligned.sortedByCoord.out.bam | 1107_WaGa_sT_Dox_EVAligned.sortedByCoord.out.bam | 1605_WaGa_sT_Dox_EVAligned.sortedByCoord.out.bam | 2706_WaGa_sT_Dox_EVAligned.sortedByCoord.out.bam | 1107_WaGa_scr_Dox_EVAligned.sortedByCoord.out.bam | 1605_WaGa_scr_Dox_EVAligned.sortedByCoord.out.bam | 2706_WaGa_scr_Dox_EVAligned.sortedByCoord.out.bam | 1107_WaGa_wt_EVAligned.sortedByCoord.out.bam | 1605_WaGa_wt_EVAligned.sortedByCoord.out.bam | 2706_WaGa_wt_EVAligned.sortedByCoord.out.bam | WaGa_RNAAligned.sortedByCoord.out.bam | WaGa_RNA_118Aligned.sortedByCoord.out.bam | WaGa_RNA_147Aligned.sortedByCoord.out.bam | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
ENSG00000000003 | 5.392 | 5.672 | 5.883 | 5.044 | 5.210 | 5.285 | 5.426 | 5.248 | 5.728 | 4.954 | 4.322 | 5.392 | 5.322 | 6.150 | 3.907 | 6.539 | 6.755 | 7.210 |
ENSG00000000005 | 3.170 | 3.585 | 3.907 | 2.322 | 3.000 | 2.807 | 3.322 | 2.322 | 3.459 | 3.585 | 2.322 | 2.807 | 2.322 | 3.000 | 3.000 | 0.000 | 1.000 | 0.000 |
ENSG00000000419 | 6.600 | 6.794 | 7.313 | 6.426 | 6.508 | 7.011 | 7.022 | 7.686 | 7.340 | 6.742 | 6.700 | 6.088 | 7.665 | 7.942 | 6.686 | 9.377 | 9.185 | 8.948 |
ENSG00000000457 | 6.044 | 6.109 | 6.190 | 6.443 | 5.615 | 6.658 | 6.539 | 5.170 | 5.728 | 5.755 | 5.672 | 5.672 | 6.285 | 5.615 | 5.248 | 7.593 | 7.267 | 6.954 |
ENSG00000000460 | 5.728 | 6.150 | 6.170 | 4.755 | 5.322 | 5.833 | 6.267 | 6.304 | 5.858 | 5.492 | 5.931 | 6.066 | 5.781 | 6.190 | 5.210 | 8.814 | 8.061 | 7.846 |
ENSG00000000938 | 6.109 | 6.066 | 6.340 | 5.615 | 6.248 | 6.539 | 5.858 | 6.304 | 5.954 | 5.392 | 6.267 | 6.742 | 6.170 | 5.555 | 5.954 | 2.000 | 1.000 | 2.000 |
df_raw <- melt(pseudo_counts, id = rownames(raw_counts_wn))
names(df_raw)[1:2]<- c("id", "sample")
df_raw$method <- rep("Raw counts", nrow(df_raw))
#head(df_raw)
The object df_raw will be used later to compare the effect of the different normalizations on the count distribution across samples.
9.2 Count distribution
First, we provide an overview of the count distribution for the first sample (excluding the genes that were filtered out) by plotting histograms of the raw counts and the pseudo counts:
df <- data.frame(rcounts = raw_counts_wn[ ,1], prcounts = pseudo_counts[ ,1])
p <- ggplot(data=df, aes(x = rcounts, y = after_stat(density)))
p <- p + geom_histogram(fill = "lightblue")
p <- p + theme_bw()
p <- p + ggtitle(paste0("count distribution '", design$sample[1], "'"))
p <- p + xlab("counts")
p2 <- ggplot(data=df, aes(x = prcounts, y = after_stat(density)))
p2 <- p2 + geom_histogram(fill = "lightblue")
p2 <- p2 + theme_bw()
p2 <- p2 + ggtitle(paste0("count distribution - '", design$sample[1], "'"))
p2 <- p2 + xlab(expression(log[2](counts + 1)))
grid.arrange(p, p2, ncol = 2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
This figure shows that the count distribution is highly skewed, with a large proportion of genes having a count of zero and a small number of genes having extremely large counts.
9.3 Relation between mean and variance
Another important feature of RNA-Seq data is that they are overdispersed: for a given gene, the variance of its counts across biological replicates within a condition is larger than its mean count. This is illustrated by plotting the variance versus the mean for condition "wt.EV", restricted to genes with an average count smaller than 5000 (otherwise the plot is hard to read):
df <- data.frame(mean = apply(raw_counts_wn[ ,design$condition == "wt.EV"], 1, mean),
var = apply(raw_counts_wn[ ,design$condition == "wt.EV"], 1, var))
df <- df[df$mean <= 5000, ]
p <- ggplot(data=df, aes(x = mean, y = var))
p <- p + geom_point(colour = "orange")
p <- p + theme_bw()
p <- p + geom_abline(aes(intercept=0, slope=1))
p <- p + ggtitle("Variance versus mean in counts") + ylab("variance")
print(p)
In this figure, the black line is the “y=x” diagonal. It is easy to see that, for most genes, the variance is much larger than the mean.
10 Normalization
This section applies several normalization approaches to the RNA-Seq data, using either DESeq2 (RLE) or edgeR (TC, RPKM, UQ, TMM). The resulting count distributions are compared to the raw count distribution in the last part of this section.
10.1 DESeq2
Normalized counts can be obtained using the function counts with the option normalized = TRUE. The normalized counts are computed as $\tilde{K}_{gj} = K_{gj} / s_j$, where $s_j$ is the size factor of sample $j$. The following command lines compare the result of the counts function with this calculation:
deseq_normcount <- counts(dds, normalized = TRUE)
test_normcount <- sweep(raw_counts_wn, 2, sizeFactors(dds), "/")
sum(test_normcount != deseq_normcount)
## [1] 0
Finally, pseudo counts are computed and stored in a data frame for further comparison.
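The corresponding chunk is hidden; a minimal sketch producing the pseudo_deseq and df_deseq objects used later, mirroring the df_raw construction above:
pseudo_deseq <- log2(deseq_normcount + 1)
df_deseq <- melt(pseudo_deseq, id = rownames(raw_counts_wn))
names(df_deseq)[1:2] <- c("id", "sample")
df_deseq$method <- rep("DESeq2 (RLE)", nrow(df_deseq))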
10.2 edgeR
In this section, the edgeR package is used to compute the remaining normalization approaches. To do so, a DGEList object has to be created from the count data (sketched below).
The library sizes and the normalization factors for all samples are then obtained as follows:
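The DGEList construction is not echoed in the report; a minimal sketch, assuming the object is named dge2 as in the later chunks:
library('edgeR')
# build a DGEList from the filtered raw counts; lib.size defaults to the column sums
dge2 <- DGEList(counts = raw_counts_wn)
dge2$samples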
group | lib.size | norm.factors | |
---|---|---|---|
1107_WaGa_sT_DMSO_EVAligned.sortedByCoord.out.bam | 1 | 2514456 | 1 |
1605_WaGa_sT_DMSO_EVAligned.sortedByCoord.out.bam | 1 | 2917888 | 1 |
2706_WaGa_sT_DMSO_EVAligned.sortedByCoord.out.bam | 1 | 3829508 | 1 |
1107_WaGa_scr_DMSO_EVAligned.sortedByCoord.out.bam | 1 | 2397527 | 1 |
1605_WaGa_scr_DMSO_EVAligned.sortedByCoord.out.bam | 1 | 2660616 | 1 |
2706_WaGa_scr_DMSO_EVAligned.sortedByCoord.out.bam | 1 | 3739227 | 1 |
1107_WaGa_sT_Dox_EVAligned.sortedByCoord.out.bam | 1 | 2771568 | 1 |
1605_WaGa_sT_Dox_EVAligned.sortedByCoord.out.bam | 1 | 4102294 | 1 |
2706_WaGa_sT_Dox_EVAligned.sortedByCoord.out.bam | 1 | 3446136 | 1 |
1107_WaGa_scr_Dox_EVAligned.sortedByCoord.out.bam | 1 | 2226308 | 1 |
1605_WaGa_scr_Dox_EVAligned.sortedByCoord.out.bam | 1 | 2680865 | 1 |
2706_WaGa_scr_Dox_EVAligned.sortedByCoord.out.bam | 1 | 3354354 | 1 |
1107_WaGa_wt_EVAligned.sortedByCoord.out.bam | 1 | 5828110 | 1 |
1605_WaGa_wt_EVAligned.sortedByCoord.out.bam | 1 | 5599417 | 1 |
2706_WaGa_wt_EVAligned.sortedByCoord.out.bam | 1 | 2506633 | 1 |
WaGa_RNAAligned.sortedByCoord.out.bam | 1 | 9700209 | 1 |
WaGa_RNA_118Aligned.sortedByCoord.out.bam | 1 | 8042977 | 1 |
WaGa_RNA_147Aligned.sortedByCoord.out.bam | 1 | 7815247 | 1 |
10.3 Total count (TC)
Normalization by Total Count (TC) is obtained directly by the function cpm. Pseudo-counts (log2 transformed counts) are computed and stored in a data frame for later use.
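The corresponding chunk is hidden; a minimal sketch producing the df_TC data frame used in the comparison section, mirroring the df_raw construction:
pseudo_TC <- log2(cpm(dge2) + 1)
df_TC <- melt(pseudo_TC, id = rownames(raw_counts_wn))
names(df_TC)[1:2] <- c("id", "sample")
df_TC$method <- rep("TC", nrow(df_TC))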
10.4 RPKM
RPKM needs information about gene lengths which have to be passed to the function rpkm. Again, pseudo-counts are computed and stored in a data frame for further use.
# prepare the function gene_lengths
ensembl_list <- gene_info$gene_id
human <- useMart("ensembl", dataset="hsapiens_gene_ensembl")
gene_coords=getBM(attributes=c("hgnc_symbol","ensembl_gene_id", "start_position","end_position"), filters="ensembl_gene_id", values=ensembl_list, mart=human)
gene_coords$size=gene_coords$end_position - gene_coords$start_position
head(gene_coords)
hgnc_symbol | ensembl_gene_id | start_position | end_position | size |
---|---|---|---|---|
TSPAN6 | ENSG00000000003 | 100627108 | 100639991 | 12883 |
TNMD | ENSG00000000005 | 100584936 | 100599885 | 14949 |
DPM1 | ENSG00000000419 | 50934867 | 50959140 | 24273 |
SCYL3 | ENSG00000000457 | 169849631 | 169894267 | 44636 |
C1orf112 | ENSG00000000460 | 169662007 | 169854080 | 192073 |
FGR | ENSG00000000938 | 27612064 | 27635185 | 23121 |
my_values <- gene_coords$size
my_names <- gene_coords$ensembl_gene_id
gene_lengths <- setNames(my_values, my_names)
# align gene lengths with the rows of the filtered count matrix (match by gene id)
gene_lengths_wn <- gene_lengths[rownames(raw_counts_wn)]
pseudo_RPKM <- log2(rpkm(dge2, gene.length = gene_lengths_wn) + 1)
df_RPKM <- melt(pseudo_RPKM, id = rownames(raw_counts_wn))
names(df_RPKM)[1:2] <- c ("id", "sample")
df_RPKM$method <- rep("RPKM", nrow(df_RPKM))
Upper quartile (UQ) normalization is obtained with the function calcNormFactors. This function changes the variable norm.factors in the DGEList object and thus also the way the functions cpm and rpkm compute counts: a normalized library size, equal to the original library size multiplied by the normalization factor, is now used, i.e. $\tilde{K}_{gj} = K_{gj} / (\text{normalized library size}_j) \times 10^6$. The resulting normalization factors are shown below. Note that $\text{size.factors} = \text{lib.size} \times \text{norm.factors} / 10^6$.
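The call itself is not echoed; a minimal sketch that would produce the norm.factors shown in the table below:
dge2 <- calcNormFactors(dge2, method = "upperquartile")
dge2$samples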
group | lib.size | norm.factors | |
---|---|---|---|
1107_WaGa_sT_DMSO_EVAligned.sortedByCoord.out.bam | 1 | 2514456 | 1.5370 |
1605_WaGa_sT_DMSO_EVAligned.sortedByCoord.out.bam | 1 | 2917888 | 1.4654 |
2706_WaGa_sT_DMSO_EVAligned.sortedByCoord.out.bam | 1 | 3829508 | 1.1810 |
1107_WaGa_scr_DMSO_EVAligned.sortedByCoord.out.bam | 1 | 2397527 | 1.5776 |
1605_WaGa_scr_DMSO_EVAligned.sortedByCoord.out.bam | 1 | 2660616 | 1.4216 |
2706_WaGa_scr_DMSO_EVAligned.sortedByCoord.out.bam | 1 | 3739227 | 1.2974 |
1107_WaGa_sT_Dox_EVAligned.sortedByCoord.out.bam | 1 | 2771568 | 1.4537 |
1605_WaGa_sT_Dox_EVAligned.sortedByCoord.out.bam | 1 | 4102294 | 0.8819 |
2706_WaGa_sT_Dox_EVAligned.sortedByCoord.out.bam | 1 | 3446136 | 1.2169 |
1107_WaGa_scr_Dox_EVAligned.sortedByCoord.out.bam | 1 | 2226308 | 1.5697 |
1605_WaGa_scr_Dox_EVAligned.sortedByCoord.out.bam | 1 | 2680865 | 1.5643 |
2706_WaGa_scr_Dox_EVAligned.sortedByCoord.out.bam | 1 | 3354354 | 1.3727 |
1107_WaGa_wt_EVAligned.sortedByCoord.out.bam | 1 | 5828110 | 0.9030 |
1605_WaGa_wt_EVAligned.sortedByCoord.out.bam | 1 | 5599417 | 0.6755 |
2706_WaGa_wt_EVAligned.sortedByCoord.out.bam | 1 | 2506633 | 1.4106 |
WaGa_RNAAligned.sortedByCoord.out.bam | 1 | 9700209 | 0.3391 |
WaGa_RNA_118Aligned.sortedByCoord.out.bam | 1 | 8042977 | 0.2965 |
WaGa_RNA_147Aligned.sortedByCoord.out.bam | 1 | 7815247 | 0.2841 |
The previous formula is compared to the result of the function cpm in the following command lines:
test_normcount <- sweep(dge2$counts, 2,
dge2$samples$lib.size*dge2$samples$norm.factors / 10^6,
"/")
range(as.vector(test_normcount - cpm(dge2)))
## [1] -7.276e-12 7.276e-12
which shows only negligible floating-point differences. Finally, pseudo counts are obtained and stored in a data frame (sketched below):
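A minimal sketch of the hidden chunk, mirroring the earlier data-frame constructions:
pseudo_UQ <- log2(cpm(dge2) + 1)
df_UQ <- melt(pseudo_UQ, id = rownames(raw_counts_wn))
names(df_UQ)[1:2] <- c("id", "sample")
df_UQ$method <- rep("UQ", nrow(df_UQ))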
10.6 TMM
TMM normalization works similarly to UQ and updates the value of the variable norm.factors:
group | lib.size | norm.factors | |
---|---|---|---|
1107_WaGa_sT_DMSO_EVAligned.sortedByCoord.out.bam | 1 | 2514456 | 1.3456 |
1605_WaGa_sT_DMSO_EVAligned.sortedByCoord.out.bam | 1 | 2917888 | 1.2298 |
2706_WaGa_sT_DMSO_EVAligned.sortedByCoord.out.bam | 1 | 3829508 | 1.0654 |
1107_WaGa_scr_DMSO_EVAligned.sortedByCoord.out.bam | 1 | 2397527 | 1.3707 |
1605_WaGa_scr_DMSO_EVAligned.sortedByCoord.out.bam | 1 | 2660616 | 1.1780 |
2706_WaGa_scr_DMSO_EVAligned.sortedByCoord.out.bam | 1 | 3739227 | 1.0892 |
1107_WaGa_sT_Dox_EVAligned.sortedByCoord.out.bam | 1 | 2771568 | 1.2840 |
1605_WaGa_sT_Dox_EVAligned.sortedByCoord.out.bam | 1 | 4102294 | 0.8046 |
2706_WaGa_sT_Dox_EVAligned.sortedByCoord.out.bam | 1 | 3446136 | 1.0757 |
1107_WaGa_scr_Dox_EVAligned.sortedByCoord.out.bam | 1 | 2226308 | 1.4004 |
1605_WaGa_scr_Dox_EVAligned.sortedByCoord.out.bam | 1 | 2680865 | 1.3521 |
2706_WaGa_scr_Dox_EVAligned.sortedByCoord.out.bam | 1 | 3354354 | 1.1472 |
1107_WaGa_wt_EVAligned.sortedByCoord.out.bam | 1 | 5828110 | 0.8482 |
1605_WaGa_wt_EVAligned.sortedByCoord.out.bam | 1 | 5599417 | 0.6435 |
2706_WaGa_wt_EVAligned.sortedByCoord.out.bam | 1 | 2506633 | 1.2032 |
WaGa_RNAAligned.sortedByCoord.out.bam | 1 | 9700209 | 0.5494 |
WaGa_RNA_118Aligned.sortedByCoord.out.bam | 1 | 8042977 | 0.5983 |
WaGa_RNA_147Aligned.sortedByCoord.out.bam | 1 | 7815247 | 0.6189 |
Normalized pseudo counts are obtained with the function cpm and stored in a data frame (sketched below):
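A minimal sketch of the hidden chunk; the calcNormFactors call produced the norm.factors shown in the table above:
dge2 <- calcNormFactors(dge2, method = "TMM")
pseudo_TMM <- log2(cpm(dge2) + 1)
df_TMM <- melt(pseudo_TMM, id = rownames(raw_counts_wn))
names(df_TMM)[1:2] <- c("id", "sample")
df_TMM$method <- rep("TMM", nrow(df_TMM))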
10.7 Comparison
This last section shows the comparison between all normalization methods (and original raw data) for this data set. First, the distributions of all samples and all normalization methods are compared with boxplots and density plots:
df_allnorm <- rbind(df_raw, df_deseq, df_TC, df_RPKM, df_UQ, df_TMM)
df_allnorm$method <- factor(df_allnorm$method,
levels = c("Raw counts", "DESeq2 (RLE)", "TC",
"RPKM", "TMM", "UQ"))
p <- ggplot(data=df_allnorm, aes(x=sample, y=value, fill=method))
p <- p + geom_boxplot()
p <- p + theme_bw()
p <- p + ggtitle("Boxplots of normalized pseudo counts\nfor all samples by normalization method")
p <- p + facet_grid(. ~ method)
p <- p + ylab(expression(log[2] ~ (normalized ~ count + 1))) + xlab("")
p <- p + theme(title = element_text(size=10), axis.text.x = element_blank(),
axis.ticks.x = element_blank())
print(p)
## Warning: Removed 1836 rows containing non-finite values (`stat_boxplot()`).
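The density-plot counterpart mentioned above is not shown in the rendered output; a minimal sketch based on the same df_allnorm data frame:
p <- ggplot(data=df_allnorm, aes(x=value, colour=method))
p <- p + geom_density()
p <- p + theme_bw()
p <- p + ggtitle("Density of normalized pseudo counts by normalization method")
p <- p + xlab(expression(log[2] ~ (normalized ~ count + 1)))
print(p)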
11 PCA
Finally, a PCA is performed on the pseudo counts obtained after DESeq2 (RLE) normalization:
colnames(pseudo_deseq) <- c("1107_sT_DMSO","1605_sT_DMSO","2706_sT_DMSO","1107_scr_DMSO","1605_scr_DMSO","2706_scr_DMSO","1107_sT_Dox","1605_sT_Dox","2706_sT_Dox","1107_scr_Dox","1605_scr_Dox","2706_scr_Dox","1107_wt","1605_wt","2706_wt","RNA","RNA_118","RNA_147")
resPCA <- pca(t(pseudo_deseq), ncomp = 12)
plot(resPCA)
The first component explains considerably more variance than the remaining components, which each explain roughly similar amounts. The projection of the samples onto the first two PCs is obtained with:
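The projection chunk itself is not echoed; a minimal sketch using mixOmics::plotIndiv (grouping the samples by condition is an assumption):
plotIndiv(resPCA, comp = c(1, 2), group = design$condition,
          ind.names = TRUE, legend = TRUE,
          title = "PCA of DESeq2 (RLE) pseudo counts")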
12 Differential Expression Analysis
Finally, we are ready to analyze the data by running the DESeq function on the DESeqDataSet object. The model below will take a few seconds to run. The details of this analysis are a bit complicated, but worth reading in detail if you want to do this for a project or publication (use ?DESeq and look up the references therein).
The analysis is complicated because sequencing is essentially a process of random sampling, so two genes may look like they are differentially expressed between two samples just because one sample has more overall RNA or because one sample has another gene ‘hogging’ all the sequence reads. So it’s not possible to compare one gene across samples without also considering all the other genes in those samples.
dds$condition <- relevel(dds$condition, "scr.DMSO.EV")
de_results <- DESeq(dds) # betaPrior=FALSE is the default, so it does not need to be set explicitly
## using pre-existing size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
Once the analysis is done, we can extract a table of results using the results() function. This is useful for other downstream analyses including visualization.
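The extraction chunk is hidden; a minimal sketch reproducing the output below (the name res_wt is ours):
resultsNames(de_results)
res_wt <- results(de_results, name = "condition_wt.EV_vs_scr.DMSO.EV")
head(res_wt)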
## [1] "Intercept" "condition_RNA_vs_scr.DMSO.EV"
## [3] "condition_scr.Dox.EV_vs_scr.DMSO.EV" "condition_sT.DMSO.EV_vs_scr.DMSO.EV"
## [5] "condition_sT.Dox.EV_vs_scr.DMSO.EV" "condition_wt.EV_vs_scr.DMSO.EV"
## log2 fold change (MLE): condition wt.EV vs scr.DMSO.EV
## Wald test p-value: condition wt.EV vs scr.DMSO.EV
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSG00000000003 47.26900 -0.125133 0.402798 -0.310660 0.7560593
## ENSG00000000005 6.60857 -0.133363 0.762828 -0.174827 0.8612156
## ENSG00000000419 176.46105 0.556557 0.271188 2.052294 0.0401411
## ENSG00000000457 70.91095 -0.831785 0.330742 -2.514909 0.0119063
## ENSG00000000460 83.37109 0.155099 0.305115 0.508328 0.6112234
## ENSG00000000938 58.98183 -0.502072 0.328987 -1.526116 0.1269809
## padj
## <numeric>
## ENSG00000000003 0.8570386
## ENSG00000000005 NA
## ENSG00000000419 0.1726800
## ENSG00000000457 0.0887135
## ENSG00000000460 0.7588803
## ENSG00000000938 0.3214849
- baseMean – the mean of the normalized counts for this gene, averaged over all samples.
- log2FoldChange – raise 2 to this power to get the fold change. For example, a value of -2.32 corresponds to a ratio of 2^(-2.32), or about 0.2, between the contrasted condition and the reference (here wt.EV vs scr.DMSO.EV), i.e. the gene is expressed roughly five-fold lower in the contrasted condition.
- lfcSE – the standard error of the log2FoldChange estimate.
- pvalue – the probability of observing a log2FoldChange at least this extreme by chance under the null hypothesis of no differential expression. Because sequencing is a random sampling process, this depends on the magnitude of the difference relative to the total number of reads.
- padj – the p-value adjusted for multiple testing (Benjamini-Hochberg). The adjustment accounts for the fact that many genes are tested at once: with 10,000 genes we would expect roughly 500 genes with P < 0.05 by chance alone (0.05 * 10,000).