## ----setup, include=FALSE------------------------------------------------
# Set the knitr chunk option `echo` to TRUE as the default for all chunks.
knitr::opts_chunk$set(echo = TRUE)

## ----loadPackages, message=FALSE, warning=FALSE--------------------------
library(ggplot2)
library(gridExtra)
library(reshape2)
library(mixOmics)
library(RColorBrewer)
library(DESeq)
library(edgeR)
library(VennDiagram)
library(devtools)

## ----importData, message=FALSE-------------------------------------------
# Count table: the first column supplies the row names and the result is
# converted to a matrix.
raw_counts <- as.matrix(
  read.table("../data/D1-counts.txt", header = TRUE, row.names = 1)
)

# Sample description table; character columns are kept as strings.
design <- read.table(
  "../data/D1-targets.txt",
  header = TRUE,
  stringsAsFactors = FALSE
)

# Gene lengths, read as a plain numeric vector.
gene_lengths <- scan("../data/D1-genesLength.txt")

## ----basicCountDesc------------------------------------------------------
# Dimensions of the raw count matrix.
dim(raw_counts)

## ----headCounts----------------------------------------------------------
# First rows of the raw count matrix.
head(raw_counts)

## ----basicDesignDesc-----------------------------------------------------
# Print the sample description table.
design

## ----filterCounts--------------------------------------------------------
# Keep only the rows whose total count over all columns is strictly positive.
has_counts <- rowSums(raw_counts) > 0
raw_counts_wn <- raw_counts[has_counts, ]
dim(raw_counts_wn)

## ----pseudoCounts--------------------------------------------------------
# log2(count + 1) transform of the filtered counts.
pseudo_counts <- log2(1 + raw_counts_wn)
head(pseudo_counts)

# Long-format copy of the transformed matrix, tagged with the method name so
# that it can later be stacked with the other normalizations.
df_raw <- melt(pseudo_counts, id = rownames(raw_counts_wn))
colnames(df_raw)[1:2] <- c("id", "sample")
df_raw$method <- rep("Raw counts", times = nrow(df_raw))
head(df_raw)

## ----histoCount, warning=FALSE, message=FALSE----------------------------
# Raw and log2-transformed counts of the first column of the count matrix.
df <- data.frame(
  rcounts = raw_counts_wn[, 1],
  prcounts = pseudo_counts[, 1]
)

# Density-scaled histogram of the raw counts.
p <- ggplot(data = df, aes(x = rcounts, y = ..density..)) +
  geom_histogram(fill = "lightblue") +
  theme_bw() +
  ggtitle(paste0("count distribution '", design$labels[1], "'")) +
  xlab("counts")

# Same histogram for the log2(count + 1) values.
p2 <- ggplot(data = df, aes(x = prcounts, y = ..density..)) +
  geom_histogram(fill = "lightblue") +
  theme_bw() +
  ggtitle(paste0("count distribution - '", design$labels[1], "'")) +
  xlab(expression(log[2](counts + 1)))

# Both histograms side by side.
grid.arrange(p, p2, ncol = 2)

## ----saveHist, echo=FALSE, fig.show='hide', message=FALSE, results='hide'----
# Redraw the figure and copy the current device to a PNG file.
grid.arrange(p, p2, ncol = 2)
dev.print(
  png,
  file = "../images/count-hist.png",
  width = 1000,
  height = 600,
  res = 150
)

## ----variabilityPlot-----------------------------------------------------
# Per-row mean and variance of the counts over the columns whose design
# group is "wt".
wt_counts <- raw_counts_wn[, design$group == "wt"]
df <- data.frame(
  mean = apply(wt_counts, 1, mean),
  var = apply(wt_counts, 1, var)
)
# Restrict the plot to rows with a mean of at most 5000.
df <- df[df$mean <= 5000, ]

# Scatter plot of variance against mean with the line y = x as reference.
p <- ggplot(data = df, aes(x = mean, y = var)) +
  geom_point(colour = "orange") +
  theme_bw() +
  geom_abline(aes(intercept = 0, slope = 1)) +
  ggtitle("Variance versus mean in counts") +
  ylab("variance")
print(p)

## ----saveVarPlot, echo=FALSE, fig.show='hide', message=FALSE, results='hide'----
# Redraw the figure and copy the current device to a PNG file.
print(p)
dev.print(
  png,
  file = "../images/var-vs-mean.png",
  width = 600,
  height = 600,
  res = 150
)

## ----PCA, cache = TRUE---------------------------------------------------
# PCA (12 components) on the transposed pseudo-count matrix, so that the
# columns of `pseudo_counts` become the rows given to `pca()`.
pseudo_counts_t <- t(pseudo_counts)
resPCA <- pca(pseudo_counts_t, ncomp = 12)
plot(resPCA)

## ----plotIndPCA, warning = FALSE-----------------------------------------
# Individuals plot coloured by design group, using a 4-colour palette.
group_colours <- brewer.pal(4, "Dark2")
plotIndiv(resPCA, group = design$group, col.per.group = group_colours)

## ----prepareDESeq--------------------------------------------------------
# Wrap the filtered counts and the design groups in a DESeq count data set.
dge <- newCountDataSet(raw_counts_wn, conditions = design$group)
dge

## ----originalSizeDESeq---------------------------------------------------
# Size factors before estimation.
sizeFactors(dge)

## ----normDESeq-----------------------------------------------------------
# Estimate the size factors and display them.
dge <- estimateSizeFactors(dge)
sizeFactors(dge)

## ----normCountsDESeq-----------------------------------------------------
# Cross-check: divide every column of the filtered counts by its size factor
# and count the cells that differ from the normalized counts DESeq returns.
deseq_normcount <- counts(dge, normalized = TRUE)
size_factors <- sizeFactors(dge)
test_normcount <- sweep(
  raw_counts_wn,
  MARGIN = 2,
  STATS = size_factors,
  FUN = "/"
)
sum(deseq_normcount != test_normcount)

## ----dfDESeq-------------------------------------------------------------
# log2(normalized count + 1) for the DESeq-normalized counts.
pseudo_deseq <- log2(deseq_normcount + 1)

# Long-format copy tagged with the method name. The label vector is sized
# from `df_deseq` itself (not from `df_raw`), matching how the other
# normalization chunks build their `method` column.
df_deseq <- melt(pseudo_deseq, id = rownames(raw_counts_wn))
names(df_deseq)[1:2] <- c("id", "sample")
df_deseq$method <- rep("DESeq (RLE)", nrow(df_deseq))

## ----prepareDataEdgeR----------------------------------------------------
# edgeR container built from the filtered counts.
dge2 <- DGEList(raw_counts_wn)
dge2

## ----edgeRLibSize--------------------------------------------------------
# Per-sample information stored in the DGEList.
dge2$samples

## ----normTC--------------------------------------------------------------
# log2(CPM + 1), computed before any call to calcNormFactors().
pseudo_TC <- log2(1 + cpm(dge2))

df_TC <- melt(pseudo_TC, id = rownames(raw_counts_wn))
colnames(df_TC)[1:2] <- c("id", "sample")
df_TC$method <- rep("TC", times = nrow(df_TC))

## ----RPKM----------------------------------------------------------------
# Gene lengths restricted with the same row filter that produced
# `raw_counts_wn`, then log2(RPKM + 1).
gene_lengths_wn <- gene_lengths[rowSums(raw_counts) > 0]
rpkm_values <- rpkm(dge2, gene.length = gene_lengths_wn)
pseudo_RPKM <- log2(1 + rpkm_values)

df_RPKM <- melt(pseudo_RPKM, id = rownames(raw_counts_wn))
colnames(df_RPKM)[1:2] <- c("id", "sample")
df_RPKM$method <- rep("RPKM", times = nrow(df_RPKM))

## ----normUQ--------------------------------------------------------------
# Upper-quartile normalization factors.
dge2 <- calcNormFactors(dge2, method = "upperquartile")
dge2$samples

## ----compareUQ-----------------------------------------------------------
# Cross-check: divide each column of the counts by
# (library size * normalization factor) / 10^6 and show the range of the
# differences with cpm().
effective_lib_size <- dge2$samples$lib.size * dge2$samples$norm.factors
test_normcount <- sweep(
  dge2$counts,
  MARGIN = 2,
  STATS = effective_lib_size / 10^6,
  FUN = "/"
)
range(as.vector(test_normcount - cpm(dge2)))

## ----pseudoUQ------------------------------------------------------------
# log2(CPM + 1) with the upper-quartile factors in place.
pseudo_UQ <- log2(1 + cpm(dge2))

df_UQ <- melt(pseudo_UQ, id = rownames(raw_counts_wn))
colnames(df_UQ)[1:2] <- c("id", "sample")
df_UQ$method <- rep("UQ", times = nrow(df_UQ))

## ----TMM-----------------------------------------------------------------
# TMM normalization factors.
dge2 <- calcNormFactors(dge2, method = "TMM")
dge2$samples

## ----pseudoTMM-----------------------------------------------------------
# log2(CPM + 1) with the TMM factors in place.
pseudo_TMM <- log2(1 + cpm(dge2))

df_TMM <- melt(pseudo_TMM, id = rownames(raw_counts_wn))
colnames(df_TMM)[1:2] <- c("id", "sample")
df_TMM$method <- rep("TMM", times = nrow(df_TMM))

## ----boxCompareNorm------------------------------------------------------
# Stack the long-format tables of every normalization and fix the order in
# which the methods appear in the facets.
df_allnorm <- rbind(df_raw, df_deseq, df_TC, df_RPKM, df_UQ, df_TMM)
method_order <- c("Raw counts", "DESeq (RLE)", "TC", "RPKM", "TMM", "UQ")
df_allnorm$method <- factor(df_allnorm$method, levels = method_order)

# One boxplot per sample, one facet per normalization method; the sample
# names on the x axis are hidden.
p <- ggplot(data = df_allnorm, aes(x = sample, y = value, fill = method)) +
  geom_boxplot() +
  theme_bw() +
  ggtitle("Boxplots of normalized pseudo counts\n
for all samples by normalization methods") +
  facet_grid(. ~ method) +
  ylab(expression(log[2] ~ (normalized ~ count + 1))) +
  xlab("") +
  theme(
    title = element_text(size = 10),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )
print(p)

## ----saveBoxComparePlot, echo=FALSE, fig.show='hide', message=FALSE, results='hide'----
# Redraw the figure and copy the current device to a PNG file.
print(p)
dev.print(
  png,
  file = "../images/norm_comparison.png",
  width = 1000,
  height = 600,
  res = 150
)

## ----densityCompareNorm, fig.width=10------------------------------------
# Density of the pseudo counts, one curve per sample and one facet per
# normalization method.
p <- ggplot(data = df_allnorm, aes(x = value, colour = sample))
p <- p + geom_density()
p <- p + theme_bw()
p <- p + ggtitle("Density of normalized pseudo counts\n
for all samples by normalization methods")
p <- p + facet_grid(. ~ method)
# The plotted variable (`value`, the log2 pseudo count) is on the x axis and
# the estimated density on the y axis, so the log2 label belongs to x. The x
# tick labels are kept because they carry the numeric scale.
p <- p + xlab(expression(log[2] ~ (normalized ~ count + 1))) + ylab("density")
p <- p + theme(title = element_text(size = 10))
print(p)

## ----normPCA, cache = TRUE-----------------------------------------------
# PCA (12 components) on the transposed TMM pseudo-count matrix, so that the
# columns of `pseudo_TMM` become the rows given to `pca()`.
pseudo_TMM_t <- t(pseudo_TMM)
resPCA <- pca(pseudo_TMM_t, ncomp = 12)
plot(resPCA)

## ----plotIndNormPCA, warning = FALSE-------------------------------------
# Individuals plot coloured by design group, using a 4-colour palette.
group_colours <- brewer.pal(4, "Dark2")
plotIndiv(resPCA, group = design$group, col.per.group = group_colours)

## ----pairwiseExact, cache = TRUE-----------------------------------------
# Pairwise edgeR exact tests: every group other than 'wt' is compared with
# 'wt' on a data set restricted to the samples of those two groups.
# Outputs (consumed by the following chunks):
#   DEG_pairwiseExact   character matrix, one row per significant gene and
#                       condition (gene name, decideTestsDGE code, condition);
#                       NULL when there is no condition to test
#   pvals_pairwiseExact numeric vector: for each condition in turn, the raw
#                       p-values followed by the BH-adjusted p-values
all_conditions <- setdiff(unique(design$group), "wt")
n_conditions <- length(all_conditions)

# Per-condition results are stored in preallocated lists and combined once
# after the loop instead of growing objects with c()/rbind() at each pass.
deg_by_condition <- vector("list", n_conditions)
pvals_by_condition <- vector("list", n_conditions)

# One BCV panel per tested condition; the previous graphical layout is
# restored after the loop so that later plots are not squeezed into a panel.
old_par <- par(mfrow = c(1, max(1, n_conditions)))
for (ind in seq_along(all_conditions)) {
  # create dataset with 'wt' and current mutant and normalize (TMM)
  # Samples are picked from the design table by exact group match ('wt'
  # first), the same table that provides the group labels below; this avoids
  # regex matching of group names against the column names.
  sel_cols <- c(which(design$group == "wt"),
                which(design$group == all_conditions[ind]))
  cur_counts <- raw_counts_wn[, sel_cols]
  group <- relevel(as.factor(design$group[sel_cols]), ref = "wt")
  cur_dge <- DGEList(cur_counts, group = group)
  cur_dge <- calcNormFactors(cur_dge, method = "TMM")

  # estimate dispersions
  cur_dge <- estimateCommonDisp(cur_dge)
  cur_dge <- estimateTagwiseDisp(cur_dge)
  plotBCV(cur_dge,
          main = paste0("BCV plot 'wt' vs '", all_conditions[ind], "'"))

  # exact test
  res_et <- exactTest(cur_dge)
  cat("Exact test results:\n")
  print(res_et)
  pvals_by_condition[[ind]] <- c(res_et$table$PValue,
                                 p.adjust(res_et$table$PValue, method = "BH"))

  cat("Top 10 DEG for 'wt' vs '", all_conditions[ind], "':\n")
  print(topTags(res_et))

  # genes declared differentially expressed at 5% after BH adjustment
  cur_res <- decideTestsDGE(res_et, adjust.method = "BH", p.value = 0.05)
  print(cur_res)
  sel_deg <- which(cur_res[, 1] != 0)
  deg_by_condition[[ind]] <- cbind(rownames(cur_counts)[sel_deg],
                                   cur_res[sel_deg, 1],
                                   rep(all_conditions[ind], length(sel_deg)))
}
par(old_par)

DEG_pairwiseExact <- do.call(rbind, deg_by_condition)
pvals_pairwiseExact <- unlist(pvals_by_condition)

## ----exactTestPost-------------------------------------------------------
# Turn the stacked per-condition results into a data frame and list the
# distinct gene names it contains.
DEG_pairwiseExact <- as.data.frame(DEG_pairwiseExact)
colnames(DEG_pairwiseExact) <- c("name", "UD", "condition")
listDEG_pairwiseExact <- unique(DEG_pairwiseExact$name)

# Label every p-value: within each condition the raw p-values come first and
# the adjusted ones second, each block having one value per filtered gene.
n_genes <- nrow(raw_counts_wn)
pvalue_type <- rep(
  rep(c("raw", "adjusted"), each = n_genes),
  length(all_conditions)
)
pvalue_condition <- rep(all_conditions, each = n_genes * 2)
pvals_pairwiseExact <- data.frame(
  "pvalue" = pvals_pairwiseExact,
  "type" = pvalue_type,
  "condition" = pvalue_condition
)

## ----resultsExactTest----------------------------------------------------
# Number of selected genes per condition, the same split by `UD`, and the
# number of distinct selected genes.
table(DEG_pairwiseExact$condition)
table(DEG_pairwiseExact$condition, DEG_pairwiseExact$UD) # 1 means 'up-regulated'
length(listDEG_pairwiseExact)

## ----histExactTest, warning = FALSE, message = FALSE---------------------
# Histograms of the p-values: one row of facets per p-value type, one column
# per condition.
p <- ggplot(data = pvals_pairwiseExact, aes(x = pvalue, fill = type)) +
  geom_histogram() +
  theme_bw() +
  ggtitle("Histograms of raw/adjusted p-values for exact test") +
  facet_grid(type ~ condition) +
  theme(
    title = element_text(size = 10),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )
print(p)

## ----GLM1, cache = TRUE--------------------------------------------------
# Build one DGEList from all columns of the filtered counts, with the design
# groups, and normalize it (TMM).
cur_dge <- calcNormFactors(
  DGEList(raw_counts_wn, group = design$group),
  method = "TMM"
)

## ----GLM1DesignMatrix----------------------------------------------------
# Design matrix with a group effect only; 'wt' is the reference level.
group <- relevel(as.factor(design$group), ref = "wt")
design_matrix <- model.matrix(~ group)
design_matrix

## ----GLM1EstimeDisp, cache = TRUE----------------------------------------
# Common, trended, then tagwise dispersion estimates for this design.
cur_dge <- estimateGLMCommonDisp(cur_dge, design_matrix)
cur_dge <- estimateGLMTrendedDisp(cur_dge, design_matrix)
cur_dge <- estimateGLMTagwiseDisp(cur_dge, design_matrix)
plotBCV(cur_dge, main = "BCV plot")

## ----GLM1Fit-------------------------------------------------------------
# GLM fit (group effect only) followed by one likelihood-ratio test per
# non-'wt' condition.
# Outputs: DEG_GLM1 (data frame: name, UD, condition), listDEG_GLM1 (distinct
# selected gene names) and pvals_GLM1 (data frame: pvalue, type, condition).
fit <- glmFit(cur_dge, design_matrix)

# Locate each condition's coefficient by its column name ("group<level>")
# rather than by position: the design-matrix columns follow the factor level
# order of `group`, which need not be the order of `all_conditions`, so a
# positional index could attach a condition label to another contrast.
coef_index <- match(paste0("group", all_conditions), colnames(design_matrix))
stopifnot(all(!is.na(coef_index)))

# Per-condition results are stored in preallocated lists and combined once
# after the loop.
n_conditions <- length(all_conditions)
deg_by_condition <- vector("list", n_conditions)
pvals_by_condition <- vector("list", n_conditions)
for (ind in seq_along(all_conditions)) {
  res_GLM1 <- glmLRT(fit, coef = coef_index[ind])
  # raw p-values first, BH-adjusted p-values second
  pvals_by_condition[[ind]] <- c(res_GLM1$table$PValue,
                                 p.adjust(res_GLM1$table$PValue,
                                          method = "BH"))

  cat("Top 10 DEG for 'wt' vs '", all_conditions[ind], "':\n")
  print(topTags(res_GLM1))

  # genes declared differentially expressed at 5% after BH adjustment
  cur_res <- decideTestsDGE(res_GLM1, adjust.method = "BH", p.value = 0.05)
  print(cur_res)
  sel_deg <- which(cur_res[, 1] != 0)
  deg_by_condition[[ind]] <- cbind(rownames(raw_counts_wn)[sel_deg],
                                   cur_res[sel_deg, 1],
                                   rep(all_conditions[ind], length(sel_deg)))
}
DEG_GLM1 <- as.data.frame(do.call(rbind, deg_by_condition))
names(DEG_GLM1) <- c("name", "UD", "condition")
listDEG_GLM1 <- unique(DEG_GLM1$name)

# Label every p-value with its type and condition (per condition: one raw
# block then one adjusted block, each with one value per filtered gene).
pvals_GLM1 <- data.frame("pvalue" = unlist(pvals_by_condition),
                         "type" = rep(rep(c("raw", "adjusted"),
                                          each = nrow(raw_counts_wn)),
                                      length(all_conditions)),
                         "condition" = rep(all_conditions,
                                           each = nrow(raw_counts_wn) * 2))

## ----histGLM1, message = FALSE-------------------------------------------
# Histograms of the p-values of the group-only GLM: one row of facets per
# p-value type, one column per condition.
p <- ggplot(data = pvals_GLM1, aes(x = pvalue, fill = type))
p <- p + geom_histogram()
p <- p + theme_bw()
# The title names the method that produced `pvals_GLM1` (the GLM with a group
# effect), not the exact test of the earlier section.
p <- p + ggtitle("Histogram of raw/adjusted p-values for GLM (group)")
p <- p + facet_grid(type ~ condition)
p <- p + theme(title = element_text(size = 10), axis.text.x = element_blank(),
               axis.ticks.x = element_blank())
print(p)

## ----resultspGLM1--------------------------------------------------------
# Number of selected genes per condition, the same split by `UD`, and the
# number of distinct selected genes.
table(DEG_GLM1$condition)
table(DEG_GLM1$condition, DEG_GLM1$UD) # 1 means 'up-regulated'
length(listDEG_GLM1)

## ----GLM2, cache = TRUE--------------------------------------------------
# Build one DGEList from all columns of the filtered counts, with the design
# groups, and normalize it (TMM).
cur_dge <- calcNormFactors(
  DGEList(raw_counts_wn, group = design$group),
  method = "TMM"
)

# Design matrix with the replicate term first and the group term second.
design_matrix <- model.matrix(~ design$replicat + group)
design_matrix

## ----GLM2Disp, cache = TRUE----------------------------------------------
# Common, trended, then tagwise dispersion estimates for this design.
cur_dge <- estimateGLMCommonDisp(cur_dge, design_matrix)
cur_dge <- estimateGLMTrendedDisp(cur_dge, design_matrix)
cur_dge <- estimateGLMTagwiseDisp(cur_dge, design_matrix)
plotBCV(cur_dge, main = "BCV plot")

## ----GLM2Fit-------------------------------------------------------------
# GLM fit (replicate + group effects) followed by one likelihood-ratio test
# per non-'wt' condition.
# Outputs: DEG_GLM2 (data frame: name, UD, condition), listDEG_GLM2 (distinct
# selected gene names) and pvals_GLM2 (data frame: pvalue, type, condition).
fit <- glmFit(cur_dge, design_matrix)

# Locate each condition's coefficient by its column name ("group<level>")
# rather than by a fixed offset: the position of the group columns depends on
# how many columns the replicate term produces, and their order follows the
# factor levels of `group`, which need not be the order of `all_conditions`.
coef_index <- match(paste0("group", all_conditions), colnames(design_matrix))
stopifnot(all(!is.na(coef_index)))

# Per-condition results are stored in preallocated lists and combined once
# after the loop.
n_conditions <- length(all_conditions)
deg_by_condition <- vector("list", n_conditions)
pvals_by_condition <- vector("list", n_conditions)
for (ind in seq_along(all_conditions)) {
  res_GLM2 <- glmLRT(fit, coef = coef_index[ind])
  # raw p-values first, BH-adjusted p-values second
  pvals_by_condition[[ind]] <- c(res_GLM2$table$PValue,
                                 p.adjust(res_GLM2$table$PValue,
                                          method = "BH"))

  cat("Top 10 DEG for 'wt' vs '", all_conditions[ind], "':\n")
  print(topTags(res_GLM2))

  # genes declared differentially expressed at 5% after BH adjustment
  cur_res <- decideTestsDGE(res_GLM2, adjust.method = "BH", p.value = 0.05)
  print(cur_res)
  sel_deg <- which(cur_res[, 1] != 0)
  deg_by_condition[[ind]] <- cbind(rownames(raw_counts_wn)[sel_deg],
                                   cur_res[sel_deg, 1],
                                   rep(all_conditions[ind], length(sel_deg)))
}
DEG_GLM2 <- as.data.frame(do.call(rbind, deg_by_condition))
names(DEG_GLM2) <- c("name", "UD", "condition")
listDEG_GLM2 <- unique(DEG_GLM2$name)

# Label every p-value with its type and condition (per condition: one raw
# block then one adjusted block, each with one value per filtered gene).
pvals_GLM2 <- data.frame("pvalue" = unlist(pvals_by_condition),
                         "type" = rep(rep(c("raw", "adjusted"),
                                          each = nrow(raw_counts_wn)),
                                      length(all_conditions)),
                         "condition" = rep(all_conditions,
                                           each = nrow(raw_counts_wn) * 2))

## ----histGLM2, message = FALSE-------------------------------------------
# Histograms of the p-values of the replicate + group GLM: one row of facets
# per p-value type, one column per condition.
p <- ggplot(data = pvals_GLM2, aes(x = pvalue, fill = type))
p <- p + geom_histogram()
p <- p + theme_bw()
# The title names the method that produced `pvals_GLM2` (the GLM with
# replicate and group effects), not the exact test of the earlier section.
p <- p + ggtitle("Histogram of raw/adjusted p-values for GLM (group + replicate)")
p <- p + facet_grid(type ~ condition)
p <- p + theme(title = element_text(size = 10), axis.text.x = element_blank(),
               axis.ticks.x = element_blank())
print(p)

## ----resultsGLM2---------------------------------------------------------
# Number of selected genes per condition, the same split by `UD`, and the
# number of distinct selected genes.
table(DEG_GLM2$condition)
table(DEG_GLM2$condition, DEG_GLM2$UD) # 1 means 'up-regulated'
length(listDEG_GLM2)

## ----voom----------------------------------------------------------------
# TMM-normalized DGEList over all columns, then the voom transformation with
# the current design matrix (the diagnostic plot is drawn).
cur_dge <- calcNormFactors(DGEList(raw_counts_wn), method = "TMM")
vdge <- voom(cur_dge, design_matrix, plot = TRUE)

## ----voomFit-------------------------------------------------------------
# Linear model fit with empirical Bayes moderation, then the test decisions
# at 5% after BH adjustment.
fit <- eBayes(lmFit(vdge, design_matrix))
cur_gres <- decideTests(fit, adjust.method = "BH", p.value = 0.05)
head(cur_gres)

## ----voomRes-------------------------------------------------------------
# Collect the voom/limma results for every non-'wt' condition.
# Outputs: DEG_voom (data frame: name, UD, condition), listDEG_voom (distinct
# selected gene names) and pvals_voom (data frame: pvalue, type, condition).

# The p-value matrix does not change inside the loop, so it is read once.
res_voom <- fit$p.value

# Locate each condition's column by its name ("group<level>") rather than by
# a fixed offset: the position of the group columns depends on the other
# terms of the design, and their order follows the factor levels of `group`,
# which need not be the order of `all_conditions`.
coef_index <- match(paste0("group", all_conditions), colnames(res_voom))
stopifnot(all(!is.na(coef_index)))

# Per-condition results are stored in preallocated lists and combined once
# after the loop.
n_conditions <- length(all_conditions)
deg_by_condition <- vector("list", n_conditions)
pvals_by_condition <- vector("list", n_conditions)
for (ind in seq_along(all_conditions)) {
  cur_coef <- coef_index[ind]
  # raw p-values first, BH-adjusted p-values second
  pvals_by_condition[[ind]] <- c(res_voom[, cur_coef],
                                 p.adjust(res_voom[, cur_coef],
                                          method = "BH"))

  cat("Top 10 DEG for 'wt' vs '", all_conditions[ind], "':\n")
  print(topTable(fit, coef = cur_coef))

  # genes with a non-zero decision for this coefficient
  sel_deg <- which(cur_gres[, cur_coef] != 0)
  deg_by_condition[[ind]] <- cbind(rownames(raw_counts_wn)[sel_deg],
                                   cur_gres[sel_deg, cur_coef],
                                   rep(all_conditions[ind], length(sel_deg)))
}
DEG_voom <- as.data.frame(do.call(rbind, deg_by_condition))
names(DEG_voom) <- c("name", "UD", "condition")
listDEG_voom <- unique(DEG_voom$name)

# Label every p-value with its type and condition (per condition: one raw
# block then one adjusted block, each with one value per filtered gene).
pvals_voom <- data.frame("pvalue" = unlist(pvals_by_condition),
                         "type" = rep(rep(c("raw", "adjusted"),
                                          each = nrow(raw_counts_wn)),
                                      length(all_conditions)),
                         "condition" = rep(all_conditions,
                                           each = nrow(raw_counts_wn) * 2))

## ----histVoom, message = FALSE-------------------------------------------
# Histograms of the voom p-values: one row of facets per p-value type, one
# column per condition.
p <- ggplot(data = pvals_voom, aes(x = pvalue, fill = type))
p <- p + geom_histogram()
p <- p + theme_bw()
# The title names the method that produced `pvals_voom` (voom), not the
# exact test of the earlier section.
p <- p + ggtitle("Histogram of raw/adjusted p-values for voom")
p <- p + facet_grid(type ~ condition)
p <- p + theme(title = element_text(size = 10), axis.text.x = element_blank(),
               axis.ticks.x = element_blank())
print(p)

## ----resultsVoom---------------------------------------------------------
# Number of selected genes per condition, the same split by `UD`, and the
# number of distinct selected genes.
table(DEG_voom$condition)
table(DEG_voom$condition, DEG_voom$UD) # 1 means 'up-regulated'
length(listDEG_voom)

## ----venn----------------------------------------------------------------
# Gene lists selected by each of the four approaches, keyed by the label
# shown on the diagram.
deg_sets <- list(
  "Exact test" = listDEG_pairwiseExact,
  "GLM\n group" = listDEG_GLM1,
  "GLM\n group + replicate" = listDEG_GLM2,
  "voom" = listDEG_voom
)

# Build the Venn diagram as a grid object (no file is written because
# `filename` is NULL) and draw it.
vd <- venn.diagram(
  x = deg_sets,
  fill = brewer.pal(4, "Set3"),
  cat.col = c("darkgreen", "black", "darkblue", "darkred"),
  cat.cex = 1.5,
  fontface = "bold",
  filename = NULL
)
grid.draw(vd)

## ----saveVDPlot, echo=FALSE, fig.show='hide', message=FALSE, results='hide'----
# Draw the Venn diagram into a PNG file. The argument is spelled `filename`
# in full instead of relying on partial matching of `file`.
png(filename = "../images/venn_deg.png", width = 600, height = 600, res = 150)
grid.draw(vd)
dev.off()
# Delete the VennDiagram*.log files from the working directory with base R
# (wildcards are expanded by unlink) instead of shelling out to `rm`, which
# is not available on every platform.
unlink("VennDiagram*.log")

## ----sessionInfo---------------------------------------------------------
# Print the session information (presumably devtools::session_info(), since
# devtools is the package attached at the top of the file that exports it).
session_info()

