Methods
Identify Differentially expressed genes (Group with high expression of BRCA1 vs low)
GSEA
- GSEA, main KEGG, Reactome, Biocarta, WikiPathways pathways
  - examples
- GSEA, GO

## Run txi import
#library(tximport)
library(stringr)
#library(colorspace)
library(DESeq2)
library(ggfortify)
#library(sva)
#library(pcaExplorer)
library(extrafont)
library("AnnotationDbi")
library("org.Hs.eg.db")
library(topGO)
library(DT)
library(dplyr)
library(readr)
library(gplots)
library(RColorBrewer)
library(viridis)
library(ggpubr)
library(reshape)
library(ggrepel)
library(data.table)
library(knitr)
library(apeglm)
library(gridExtra)
library("pheatmap")
library(grid)     ## Need to attach (and not just load) grid package
library(geneplotter)
library(tibble)
library(threejs)
library(EnsDb.Hsapiens.v86)
library(rhdf5)
library(kableExtra)
library(openxlsx)
library(clusterProfiler)
library(magrittr)
library("pathview")
library(DOSE)
#library(EnsDb.Hsapiens.v75)

library(scales)
library(msigdbr)
library(fgsea)
library(tidyr)
library(cowplot)
#library(ComplexHeatmap)

#library(UpSetR)
#library(circlize)

library(enrichplot)

#setwd("F:/research/Sanyi")
tableGrob2 <- function(d, p = NULL) {
    # has_package("gridExtra")
    d <- d[order(rownames(d)),]
    tp <- gridExtra::tableGrob(d)
    if (is.null(p)) {
        return(tp)
    }

    # Fix bug: The 'group' order of lines and dots/path is different
    p_data <- ggplot_build(p)$data[[1]]
    # pcol <- unique(ggplot_build(p)$data[[1]][["colour"]])
    p_data <- p_data[order(p_data[["group"]]), ]
    pcol <- unique(p_data[["colour"]])
    ## This is fine too
    ## pcol <- unique(p_data[["colour"]])[unique(p_data[["group"]])]  
    j <- which(tp$layout$name == "rowhead-fg")

    for (i in seq_along(pcol)) {
        tp$grobs[j][[i+1]][["gp"]] <- gpar(col = pcol[i])
    }
    return(tp)
}

gseaplot2_v2 = function (x, geneSetID, title = "", color = "green", base_size = 11, 
  rel_heights = c(1.5, 0.5, 1), subplots = 1:3, pvalue_table = FALSE, 
  ES_geom = "line", ylim_topplot=NULL, addhline_topplot=FALSE) 
{
  ES_geom <- match.arg(ES_geom, c("line", "dot"))
  geneList <- position <- NULL
  if (length(geneSetID) == 1) {
    gsdata <- enrichplot:::gsInfo(x, geneSetID)
  } else {
    gsdata <- do.call(rbind, lapply(geneSetID, enrichplot:::gsInfo, object = x))
  }
  p <- ggplot(gsdata, aes_(x = ~x)) + xlab(NULL) + theme_classic(base_size) + 
    theme(panel.grid.major = element_line(colour = "grey92"), 
      panel.grid.minor = element_line(colour = "grey92"), 
      panel.grid.major.y = element_blank(), panel.grid.minor.y = element_blank()) + 
    scale_x_continuous(expand = c(0, 0))
  if (ES_geom == "line") {
    es_layer <- geom_line(aes_(y = ~runningScore, color = ~Description), 
      size = 1)
  }
  else {
    es_layer <- geom_point(aes_(y = ~runningScore, color = ~Description), 
      size = 1, data = subset(gsdata, position == 1))
  }
  p.res <- p + es_layer + theme(legend.position = c(0.8, 0.8), 
    legend.title = element_blank(), legend.background = element_rect(fill = "transparent"))
  p.res <- p.res + ylab("Running Enrichment Score") + theme(axis.text.x = element_blank(), 
    axis.ticks.x = element_blank(), axis.line.x = element_blank(), 
    plot.margin = margin(t = 0.2, r = 0.2, b = 0, l = 0.2, 
      unit = "cm"))
      if(!is.null(addhline_topplot)){
      p.res <- p.res + geom_hline(yintercept=0, colour="grey40")  
      }
      if(!is.null(ylim_topplot)){
            p.res <- p.res + coord_cartesian(ylim = ylim_topplot)
            }
  i <- 0
  for (term in unique(gsdata$Description)) {
    idx <- which(gsdata$ymin != 0 & gsdata$Description == 
      term)
    gsdata[idx, "ymin"] <- i
    gsdata[idx, "ymax"] <- i + 1
    i <- i + 1
  }
  p2 <- ggplot(gsdata, aes_(x = ~x)) + geom_linerange(aes_(ymin = ~ymin, 
    ymax = ~ymax, color = ~Description)) + xlab(NULL) + 
    ylab(NULL) + theme_classic(base_size) + theme(legend.position = "none", 
    plot.margin = margin(t = -0.1, b = 0, unit = "cm"), 
    axis.ticks = element_blank(), axis.text = element_blank(), 
    axis.line.x = element_blank()) + scale_x_continuous(expand = c(0, 
    0)) + scale_y_continuous(expand = c(0, 0))
  if (length(geneSetID) == 1) {
    v <- seq(1, sum(gsdata$position), length.out = 9)
    inv <- findInterval(rev(cumsum(gsdata$position)), v)
    if (min(inv) == 0) 
      inv <- inv + 1
    col = c(rev(brewer.pal(5, "Blues")), brewer.pal(5, "Reds"))
    ymin <- min(p2$data$ymin)
    yy <- max(p2$data$ymax - p2$data$ymin) * 0.3
    xmin <- which(!duplicated(inv))
    xmax <- xmin + as.numeric(table(inv)[unique(inv)])
    d <- data.frame(ymin = ymin, ymax = yy, xmin = xmin, 
      xmax = xmax, col = col[unique(inv)])
    p2 <- p2 + geom_rect(aes_(xmin = ~xmin, xmax = ~xmax, 
      ymin = ~ymin, ymax = ~ymax, fill = ~I(col)), data = d, 
      alpha = 0.9, inherit.aes = FALSE)
  }
  df2 <- p$data
  df2$y <- p$data$geneList[df2$x]
  p.pos <- p + geom_segment(data = df2, aes_(x = ~x, xend = ~x, 
    y = ~y, yend = 0), color = "grey")
  p.pos <- p.pos + ylab("Ranked list metric") + xlab("Rank in Ordered Dataset") + 
    theme(plot.margin = margin(t = -0.1, r = 0.2, b = 0.2, 
      l = 0.2, unit = "cm"))
  if (!is.null(title) && !is.na(title) && title != "") 
    p.res <- p.res + ggtitle(title)
  if (length(color) == length(geneSetID)) {
    p.res <- p.res + scale_color_manual(values = color)
    if (length(color) == 1) {
      p.res <- p.res + theme(legend.position = "none")
      p2 <- p2 + scale_color_manual(values = "black")
    } else {
      p2 <- p2 + scale_color_manual(values = color)
    }
  }
  if (pvalue_table) {
    # pd <- x[geneSetID, c("Description", "pvalue", "p.adjust")]
    # pd <- pd[order(pd[, 1], decreasing = FALSE), ]
    # rownames(pd) <- pd$Description
    # pd <- pd[, -1]
    pd <- x[geneSetID, c("enrichmentScore", "NES", "pvalue", "p.adjust")]
    rownames(pd) <- ""
    # pd <- pd[, ]
    pd[,1:2] <- round(pd[,1:2],2)
    pd[,3:4] <- round(pd[,3:4],3)
    colnames(pd)[1] <- "ES"

    pd <- round(pd, 4)
    tp <- tableGrob2(pd, p.res)
    p.res <- p.res + theme(legend.position = "none") + annotation_custom(tp, 
      xmin = quantile(p.res$data$x, 0.5), xmax = quantile(p.res$data$x, 
        0.95), ymin = quantile(p.res$data$runningScore, 
        0.75), ymax = quantile(p.res$data$runningScore, 
        0.9))
  }
  plotlist <- list(p.res, p2, p.pos)[subplots]
  n <- length(plotlist)
  plotlist[[n]] <- plotlist[[n]] + theme(axis.line.x = element_line(), 
    axis.ticks.x = element_line(), axis.text.x = element_text())
  if (length(subplots) == 1) 
    return(plotlist[[1]] + theme(plot.margin = margin(t = 0.2, 
      r = 0.2, b = 0.2, l = 0.2, unit = "cm")))
  if (length(rel_heights) > length(subplots)) 
    rel_heights <- rel_heights[subplots]
  plot_grid(plotlist = plotlist, ncol = 1, align = "v", rel_heights = rel_heights)
}

Methods

cancers <- c("ACC","BLCA","BRCA","CESC","CHOL", "COAD","DLBC", "ESCA", "GBM", "HNSC", "KICH","KIRC","KIRP", "LGG", "LIHC", "LUAD", "LUSC", "MESO","OV", "PAAD", "PCPG", "PRAD", "READ","SARC", "SKCM","STAD","TGCT", "THCA", "THYM", "UCEC","UCS")

mycancer = params$selected_cancers_dge
load(paste0("./data/TCGA_",gsub("-","_", mycancer), "_RNASeq_STAR_primary.RData"))
rnaseq<-assay(data, "unstranded")
rnaseq <- rnaseq[,order(colnames(rnaseq))]
colnames(rnaseq)<-substr(colnames(rnaseq),1,12)
rnaseq<-rnaseq[,!duplicated(colnames(rnaseq))]
rnaseq <- as.matrix(rnaseq)
trnaseq <- as.data.frame(t(rnaseq))
trnaseq$PatientID <- rownames(trnaseq)
colnames(trnaseq) = gsub("\\..*","",colnames(trnaseq))
merged <- allclinical[allclinical$disease==mycancer,]
merged = merge(merged, trnaseq,by.x="submitter_id",by.y="PatientID")

ensembl_id <- mapIds(org.Hs.eg.db, keys = params$selected_gene,
                           column = "ENSEMBL", keytype = "SYMBOL", multiVals = "first")
colnames(merged)[colnames(merged) == ensembl_id] <- params$selected_gene

ensembl2genes <- as.data.frame(genes(EnsDb.Hsapiens.v86,return.type="DataFrame"))
ensembl2genes$entrezid_v2=mapIds(org.Hs.eg.db,
                    keys=ensembl2genes$gene_id,
                    column="ENTREZID",
                    keytype="ENSEMBL",
                    multiVals="first")

ensembl2genes_unique = ensembl2genes[(!is.na(ensembl2genes$entrezid_v2) & (ensembl2genes$seq_name %in% c(1:22,"X","Y","MT"))),]
ensembl2genes_unique = ensembl2genes_unique[ensembl2genes_unique$gene_id %in% gsub("\\..*","",colnames(trnaseq)),]
ensembl2genes_unique$entrezid_v2 = ifelse(ensembl2genes_unique$gene_id %in% c("ENSG00000177257","ENSG00000186599","ENSG00000198129","ENSG00000226023","ENSG00000283154","ENSG00000236125","ENSG00000111215","ENSG00000107014","ENSG00000278662"), ensembl2genes_unique$entrezid, ensembl2genes_unique$entrezid_v2)
ensembl2genes_unique = ensembl2genes_unique[!(ensembl2genes_unique$gene_id %in% c("ENSG00000261832", "ENSG00000263715", "ENSG00000258653", "ENSG00000283563", "ENSG00000273167", "ENSG00000268861", "ENSG00000267645", "ENSG00000261130", "ENSG00000256966", "ENSG00000185040", "ENSG00000269226", "ENSG00000237541", "ENSG00000257046", "ENSG00000249624", "ENSG00000174353", "ENSG00000255330", "ENSG00000178934", "ENSG00000283201", "ENSG00000228696", "ENSG00000275740", "ENSG00000258724", "ENSG00000183336", "ENSG00000177243", "ENSG00000269845", "ENSG00000255508", "ENSG00000205571", "ENSG00000248472", "ENSG00000233614", "ENSG00000215190", "ENSG00000274512", "ENSG00000205572", "ENSG00000260548", "ENSG00000272916", "ENSG00000260128", "ENSG00000258790", "ENSG00000280987")), ]
ensembl2genes_unique = ensembl2genes_unique[ensembl2genes_unique$gene_biotype %in% c("IG_C_gene", "IG_D_gene", "IG_J_gene", "IG_V_gene", "protein_coding", "ribozyme", "TR_C_gene", "TR_D_gene", "TR_J_gene", "TR_V_gene"),]

msigdbr_df01 <- msigdbr(species = "Homo sapiens", category = "H")
msigdbr_df02 <- msigdbr(species = "Homo sapiens", category = "C2")
msigdbr_df = rbind(msigdbr_df01, msigdbr_df02)
msigdbr_df = msigdbr_df[grepl("^HALLMARK|^KEGG|^BIOCARTA|^REACTOME|^WP", msigdbr_df$gs_name),]
msigdbr_df_2columns=msigdbr_df[,c("gs_name", "entrez_gene")]
pathwaysHandC2 = split(x = msigdbr_df$entrez_gene, f = msigdbr_df$gs_name)

msigdbr_df01 <- msigdbr(species = "Homo sapiens", category = "H")
msigdbr_df02 <- msigdbr(species = "Homo sapiens", category = "C2")
msigdbr_df = rbind(msigdbr_df01, msigdbr_df02)
msigdbr_df_other = msigdbr_df[!grepl("^HALLMARK|^KEGG|^BIOCARTA|^REACTOME|^WP", msigdbr_df$gs_name),]
msigdbr_df_2columns_other=msigdbr_df_other[,c("gs_name", "entrez_gene")]

msigdbr_df_GO <- msigdbr(species = "Homo sapiens", category = "C5")
msigdbr_df_2columns_GO=msigdbr_df_GO[,c("gs_name", "entrez_gene")]

Identify Differentially expressed genes (Group with high expression of BRCA1 vs low)

limit1=params$selected_percentage/100
limit2=(100-params$selected_percentage)/100
subset =merged
subset$status = factor(ifelse(subset[,params$selected_gene]<quantile(subset[,params$selected_gene],c(0,limit1, limit2,1))[2],"low" ,ifelse(subset[,params$selected_gene]>quantile(subset[,params$selected_gene],c(0,limit1, limit2,1))[3],"high", NA)), levels=c("low", "high"))
subset = subset[!is.na(subset$status),]
dds <- DESeqDataSetFromMatrix(countData = as.matrix(t(subset[,gsub("\\..*","",colnames(subset)) %in% ensembl2genes_unique$gene_id])),
                              colData = subset[,c(1:58, ncol(subset))],
                              design= ~ status)
dds_de <- DESeq(dds)
#res_HighvsLow <- results(dds_de, contrast=c("status","high", "low"), alpha=0.05)
res_HighvsLow <- lfcShrink(dds=dds_de, coef="status_high_vs_low", type="apeglm")
res_HighvsLow_df = as.data.frame(res_HighvsLow)
res_HighvsLow_df = merge(res_HighvsLow_df,ensembl2genes_unique, by.x="row.names", by.y="gene_id")
res_HighvsLow_df = as.data.frame(res_HighvsLow_df[order(res_HighvsLow_df$padj),])

Interactive Volcano plot

This figure is only available in the downloaded report. To view it:
1. Download the report (HTML file)
2. Open the file on your computer
Once you have done this, all figures and tables will be visible.
If you hover over the data points on this plot with your mouse, more details will be revealed, including the gene name.

#```{r,warning=F, message=F,fig.show = 'hold', out.width = c('50%','50%'), eval = TRUE}

library(plotly)

# Assuming your data frame is called 'res_HighvsLow_df '
# If not, replace 'res_HighvsLow_df ' with your actual data frame name

# Create the ggplot object for the volcano plot
volcano_plot <- ggplot(res_HighvsLow_df , aes(x = log2FoldChange, y = -log10(padj))) +
  geom_point(aes(color = ifelse(padj < 0.05 & abs(log2FoldChange) > 1, "Significant", "Not Significant"),
                 text = paste("Gene:", gene_name,
                              "<br>log2FoldChange:", round(log2FoldChange, 2),
                              "<br>padj:", signif(padj, 3),
                              "<br>Gene Biotype:", gene_biotype)),
             alpha = 0.6) +
  scale_color_manual(values = c("Significant" = "red", "Not Significant" = "gray")) +
  theme_minimal() +
  labs(title = "Interactive Volcano Plot",
       x = "log2 Fold Change",
       y = "-log10 Adjusted p-value") +
  geom_vline(xintercept = c(-1, 1), linetype = "dashed", color = "blue") +
  geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "blue") +
  theme(legend.position = "none")  # Remove the legend

# Convert ggplot to an interactive plotly object
interactive_volcano <- ggplotly(volcano_plot, tooltip = "text")

# Customize the layout
interactive_volcano <- interactive_volcano %>%
  layout(hoverlabel = list(bgcolor = "white"),
         annotations = list(
           list(x = 1.02, y = 1.05,
                text = "Click and drag to zoom<br>Double-click to reset",
                xref = "paper", yref = "paper",
                showarrow = FALSE)
         ))

# Display the interactive plot
interactive_volcano

Not-interactive Volcano plot

volcano_plot <- ggplot(res_HighvsLow_df, aes(x = log2FoldChange, y = -log10(padj))) +
  geom_point(aes(color = ifelse(padj < 0.05 & abs(log2FoldChange) > 1, "Significant", "Not Significant"), 
                 size = baseMean), alpha = 0.6) +
  scale_color_manual(values = c("Significant" = "red", "Not Significant" = "gray")) +
  scale_size_continuous(range = c(1, 5)) +
  theme_minimal() +
  labs(title = "Enhanced Volcano Plot",
       x = "log2 Fold Change",
       y = "-log10 Adjusted p-value",
       color = "Significance",
       size = "Base Mean") +
  geom_vline(xintercept = c(-1, 1), linetype = "dashed", color = "blue") +
  geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "blue") +
  geom_text_repel(data = subset(res_HighvsLow_df, padj < 0.01 & abs(log2FoldChange) > 2),
                  aes(label = gene_name), max.overlaps = 20)
print(volcano_plot)

Results of DEG (Group with high expression of BRCA1 vs low)

datatable(
  res_HighvsLow_df[res_HighvsLow_df$padj < 0.05, 
    c("symbol", "baseMean", "log2FoldChange", "lfcSE", "pvalue", "padj")],
  caption = '',
  filter = 'top',
  options = list(
    pageLength = 10, 
    autoWidth = TRUE,
    dom = 'Bfrtip',
    buttons = c('csv', 'excel')
  ),
  extensions = 'Buttons',
  class = "compact",
  rownames = FALSE
)

This table is only available in the downloaded report. To view it:
1. Download the report (HTML file)
2. Open the file on your computer
Once you have done this, all figures and tables will be visible.

GSEA

GSEA, main KEGG, Reactome, Biocarta, WikiPathways pathways

geneList = res_HighvsLow_df$log2FoldChange
names(geneList) = res_HighvsLow_df$entrezid_v2
geneList<-na.omit(geneList)
geneList <- sort(geneList, decreasing = TRUE)

#length(geneList)
geneList=geneList[!duplicated(names(geneList))]
#length(geneList)

GSEAres <- GSEA(geneList, TERM2GENE = msigdbr_df_2columns, minGSSize = 10, pvalueCutoff = 1)
mypathways_table=GSEAres[order(GSEAres$p.adjust),]
mypathways=mypathways_table[mypathways_table$p.adjust<0.05,"ID"]

datatable(mypathways_table[mypathways_table$p.adjust<0.05,c("ID", "enrichmentScore", "NES", "pvalue", "p.adjust")],   caption = paste0('Pathways, p.adjust<0.05'), filter = 'top', options = list(
    pageLength = 10, 
    autoWidth = TRUE,
    dom = 'Bfrtip',
    buttons = c('csv', 'excel')
  ),
  extensions = 'Buttons',
  class = "compact",
  rownames = FALSE
)

GSEAres_core = GSEAres

examples

if(length(mypathways)>0){
nocomment = sapply(mypathways[1:min(length(mypathways),10)], function(x) { p1 = gseaplot2_v2(GSEAres, geneSetID = x, title = paste0(str_to_title(x), ""), ylim_topplot=NULL, pvalue_table = T, addhline_topplot=T);print(p1)})
}

GSEA, GO

GSEAres <- GSEA(geneList, TERM2GENE = msigdbr_df_2columns_GO, minGSSize = 10, pvalueCutoff = 1)
mypathways_table=GSEAres[order(GSEAres$p.adjust),]
mypathways=mypathways_table[mypathways_table$p.adjust<0.05,"ID"]

datatable(mypathways_table[mypathways_table$p.adjust<0.05,c("ID", "enrichmentScore", "NES", "pvalue", "p.adjust")],   caption = paste0('Pathways, p.adjust<0.05'), filter = 'top', options = list(
    pageLength = 10, 
    autoWidth = TRUE,
    dom = 'Bfrtip',
    buttons = c('csv', 'excel')
  ),
  extensions = 'Buttons',
  class = "compact",
  rownames = FALSE
)

GSEAres_go= GSEAres

TCGA, differential expression, GSEA

Zsofia Sztupinszki, zmsz@cancer.dk, zsofia.sztupinszki@childrens.harvard.edu