EXMD635 Lecture R Code

Hi everyone, on this page you will find the code used to generate the figures in the “Introduction to Bioinformatics” EXMD 635 course lecture at McGill University on October 3rd, 2016. The code is provided for demonstration purposes. Please feel free to consult the blog section of this webpage for more detailed tutorials.

library("affy")

# Read the .CEL files and creates an AffyBatch object
myData <- ReadAffy(celfile.path="C:/Users/Radia/Desktop/HER2 project desktop/GSE22093_RAW")

# Checking the quality of the data before normalization
library("arrayQualityMetrics")

# For quality metrics prior to normalization
arrayQualityMetrics(myData, outdir="quality_results")

# Data normalization--------------------------------
# Unormalized data
par(mar=c(12,5,2,2))
boxplot(myData[, 1:6], las=2, ylab="Signal Intensity")

# Quantile Normalization
myData.quantile <- normalize.AffyBatch.quantiles(myData)
boxplot(myData.quantile[, 1:6], las=2, ylab="Signal Intensity")

# Loess Normalization
myData.loess <- normalize.AffyBatch.loess(myData)
boxplot(myData.loess[, 1:6], las=2, ylab="Signal Intensity")

# RMA normalization--------------------------------
# Creates an eset object from the AffyBatch object
eset <- rma(myData)
boxplot(exprs(eset), las=2, ylab="Expression Values")

# Checking the quality of the data after normalization
arrayQualityMetrics(expressionset=eset, outdir="Report_for_NormGSE22093", force=TRUE)

#----------------------------------------------------------------
# Add pData to the eset object for downstream analysis
setwd("C:/Users/Radia/Desktop/HER2 project desktop")
load(file="GSE22093example.RData") # loads eset object

# Correct the colnames of the eset object
colnames(eset)
newNames <- gsub(".CEL.gz", "", colnames(eset))
colnames(eset) <- newNames

# Add the phonoData to the eset object
phenoData <- readRDS("HER2phenoData.rds")
head(phenoData)

pData(eset) <- phenoData

head(phenoData[, c(3:6, 10:11)])
tail(phenoData[, c(3:6, 10:11)])

saveRDS(eset, "eset.rds")
#----------------------------------------------------------------
# pData features for downstream analysis
# Simplify the names of our features of interest
p53status <- sapply(strsplit(as.character(phenoData$characteristics_ch1.3), ": "), function(x) x[[2]])
ER_mRNA <- sapply(strsplit(as.character(phenoData$characteristics_ch1.4), "): "), function(x) x[[2]])
ER_IHC <- sapply(strsplit(as.character(phenoData$characteristics_ch1.5), ": "), function(x) x[[2]])
grade <- sapply(strsplit(as.character(phenoData$characteristics_ch1.10), ": "), function(x) x[[2]])
qc <- sapply(strsplit(as.character(phenoData$characteristics_ch1.2), ": "), function(x) x[[2]])

# Create a data frame with the information
data <- data.frame(p53status, ER_mRNA, ER_IHC, grade, qc)

# Adjust the rownames
rownames(data) <- rownames(phenoData)

# Show the first 20 samples
head(data, 20)

#----------------------------------------------------------------
# Going from probe ids to gene names for esest
eset #Annotation: hgu133a

# Load annotation library
library("hgu133a.db")

probeids <- rownames(exprs(eset))
geneSymbols <- unlist(mget(probeids, hgu133aSYMBOL, ifnotfound=NA))
geneEntrez <- unlist(mget(probeids, hgu133aENTREZID, ifnotfound=NA))

excelData <- data.frame(exprs(eset), geneSymbols, geneEntrez, stringsAsFactors=FALSE)

head(excelData)
write.csv(excelData, file="HER2normalizedData.csv")

#----------------------------------------------------------------

# Filter gene expression data
library("genefilter")

# Non-specific filtering
f1 <- pOverA(0.25, log2(100)) # the intensity of a gene should be above log2(100) in at least 25 percent of the samples
f2 <- function(x) (IQR(x) > 0.5)
ff <- filterfun(f1, f2)
selected <- genefilter(eset, ff)
sum(selected) # 5235 probes remain
esetSub <- eset[selected, ]

#----------------------------------------------------------------
# For Hierarchical Clustering on unfiltered data
dat <- exprs(eset)
d <- dist(t(dat), method = "euclidean")
image(as.matrix(d))
hc <- hclust(d, method = "ward")
plot(hc)
# plot(hc, labels = ER_mRNA)

# Highlight samples by ER status
pal <- c("purple", "darkgreen", "gray")
grp <- factor(ER_IHC)

# Add the grp color coding
library(dendextend)
dend <- as.dendrogram(hc)
labels_colors(dend) <- pal[grp][order.dendrogram(dend)]
plot(dend)
legend("topright", legend=levels(grp), fill=pal, cex=1.5)

#----------------------------------------------------------------
# For Hierarchical Clustering on filtered data

dat <- exprs(esetSub)
d <- dist(t(dat), method = "euclidean")
image(as.matrix(d))
hc <- hclust(d, method = "ward")
plot(hc)

dend <- as.dendrogram(hc)
labels_colors(dend) <- pal[grp][order.dendrogram(dend)]
plot(dend)
legend("topright", legend=levels(grp), fill=pal, cex=1.5)

#----------------------------------------------------------------
# For Hierarchical Clustering on Top 100 most variable genes
dat <- exprs(eset)
genes.var <- apply(dat, 1, var)
genes.var.select <- order(genes.var, decreasing = T)[1:100]
dat.s <- dat[genes.var.select, ]
d.s <- dist(t(dat.s), method = "euclidean")
hc.s <- hclust(d.s, method = "ward")
plot(hc.s)

dend <- as.dendrogram(hc.s)
labels_colors(dend) <- pal[grp][order.dendrogram(dend)]
plot(dend)
legend("topright", legend=levels(grp), fill=pal, cex=1.5)

#----------------------------------------------------------------
# For Hierarchical Clustering on Top 100 most variable genes
library("ComplexHeatmap")
library("dendextend")

dat <- exprs(eset)
genes.var <- apply(dat, 1, IQR)
genes.var.select <- order(genes.var, decreasing = T)[1:100]
dat.s <- dat[genes.var.select, ]
d.s <- dist(t(dat.s), method = "euclidean")
hc.s <- hclust(d.s, method = "ward")
plot(hc.s)

dend <- as.dendrogram(hc.s)
labels_colors(dend) <- pal[grp][order.dendrogram(dend)]
plot(dend)
legend("topright", legend=levels(grp), fill=pal, cex=1.5)

# Heatmap visualization of Top 100 most variable genes by IQR
mat <- dat.s
dend = hclust(dist(mat, method = "euclidean"), method="ward.D2")
dend = color_branches(dend, k = 3)
Heatmap(mat, name = "log2 expression", cluster_rows = dend, row_names_gp=gpar(fontsize=6), column_names_gp=gpar(fontsize=6))

#----------------------------------------------------------------
# Heatmap visualization of Top 100 most variable genes by IQR
# with pData information
genes.var <- apply(dat, 1, IQR)
genes.var.select <- order(genes.var, decreasing = T)[1:100]
dat.s <- dat[genes.var.select, ]
z <- dat.s
mat <- t(scale(t(z)))

data <- data.frame(p53status, ER_mRNA, ER_IHC, grade, qc)
rownames(data) <- rownames(phenoData)

ha = HeatmapAnnotation(df = data[, 1:4],
col = list(p53status = c("MUT" = "brown1", "WT" = "black", "n/a" = "gray"),
ER_mRNA = c("ERneg" = "black", "ERpos" = "darkorchid1"),
ER_IHC= c("ERneg" = "black", "ERpos" = "darkorchid4", "NA" ="white"),
grade = c("1" = "darkblue", "2" = "darkmagenta", "3" = "darkgreen", "NA"="white")))

Heatmap(mat, name = "scaled values", cluster_rows = dend, row_names_gp=gpar(fontsize=6), column_names_gp=gpar(fontsize=9), top_annotation=ha)

#----------------------------------------------------------------
# Gene and sample distance DATA VISUALISATION
library("bioDist")

dat <- exprs(eset)
genes.var <- apply(dat, 1, IQR)
genes.var.select <- order(genes.var, decreasing = T)[1:100]
eS <- esetSub[genes.var.select, ]

# To get the closest related genes
sp = spearman.dist(eS, sample=FALSE)
f1 = featureNames(eS)[1]
closest.top(f1, sp, 3)

# For sample-distance heatmaps
library("RColorBrewer")
hmcol <- colorRampPalette(brewer.pal(10,"RdBu"))(256)
pal <- c("purple", "darkgreen", "gray")
grp <- factor(ER_mRNA)

small.eset <- exprs(eS)
d.sample.euc <- euc(t(small.eset))
heatmap(as.matrix(d.sample.euc),sym=TRUE,col=hmcol,
main='Between-sample distances (Euclidean)',labCol=NA,
xlab='cell type', cexRow=0.5, ColSideColors=pal[grp])

# For gene-distance heatmaps
d.gene.cor <- cor.dist(small.eset)
heatmap(as.matrix(d.gene.cor),sym=TRUE,col=hmcol,
main='Between-gene distances (Pearson)',
xlab='probe set id',labCol=NA, cexRow=0.5)

#----------------------------------------------------------------
# Get the gene names for the related genes in the gene-distance plot
M1 <- as.matrix(d.gene.cor)
head(M1)

library("hgu133a.db")
probeids <- rownames(M1)
geneSymbols <- unlist(mget(probeids, hgu133aSYMBOL, ifnotfound=NA))

getNewNames <- function(x) ifelse(is.na(x), names(x), x)
rownames(M1) <- getNewNames(geneSymbols)

heatmap(M1,sym=TRUE,col=hmcol,
main='Between-gene distances (Pearson)',
xlab='probe set id',labCol=NA, cexRow=0.5)

#----------------------------------------------------------------
# Get the list of DEG gene betweeen grade3 and other cancers
# Run DEG with limma
library("limma")

# Define the groups for differential expression
grade3status <- function(x) ifelse(x=="3", "grade 3", "other")
grp <- factor(grade3status(grade), levels=c("other", "grade 3"))

# Get filtered expression data
M1 <- exprs(esetSub)
dim(M1) # 5284 x 103

# define model matrix for lmFit
design <- model.matrix(~grp)
head(design)

# Fit models
fit <- lmFit(M1, design)
fit <- eBayes(fit)

res <- topTable(fit, coef="grpgrade 3", n="Inf", adjust.method="BH")
dim(res)
head(res)

# Add gene names
require("hgu133a.db")
probeids <- rownames(res)
geneSymbols <- unlist(mget(probeids, hgu133aSYMBOL, ifnotfound=NA))
gene_symbols <- getNewNames(geneSymbols)
resPrint <- cbind(res, gene_symbols)

# Save the table as a txt file
write.table(resPrint, "DEG_Her2_Grade3vsOthers.txt", row.names=TRUE, col.names=NA, quote=FALSE, sep="\t")

# To just print the probes that were significant by adjusted p value
resB <- topTable(fit, coef="grpgrade 3", n="Inf", adjust.method="BH", p.value=0.05)
dim(resB) # 2578 x 6

#----------------------------------------------------------------
# Heatmap for the DEG genes between Grade 3 HER2+ tumors and Other types
library("gplots")
grp <- factor(grade)
grp <- relevel(grp, ref="3")
pal <- c("darkgreen", "cyan", "darkblue", "gray")

# Heatmap with scaled expression values
heatmap(M2,sym=FALSE,col=hmcol,
ColSideColors=pal[grp], xlab='HER2+ BC Samples', labCol=NA)

# or with heatmap.2() function
# Print Heatmap with the legend
heatmap.2(M2,scale="row",col=hmcol,
ColSideColors=pal[grp], xlab='HER2+ BC Samples', labCol=NA, trace="none", key.title="")
legend("topright", legend=levels(grp), fill=pal, cex=0.6, title="Grade")

#----------------------------------------------------------------
# Example DESeq2 script
### Do not run ###
# Get the sample table to run the analysis for all samples --------------------------------------------
sampleTable <- data.frame(sampleName=sampleNames, fileName=sampleFiles, condition=sampleCondition)
sampleTable

# Get the desq2 data object
ddsHTSeq <- DESeqDataSetFromHTSeqCount(sampleTable=sampleTable, directory=directory, design=~condition)

# Save a table of the counts data
countsTable <- counts(ddsHTSeq)
head(countsTable)

dds <- DESeq(ddsHTSeq)

res <- results(dds)

summary(res)

# Doing the normalization after filtering genes that are not expressed across the samples
dds1 <- dds[rowSums(counts(dds)) > 1, ]
rld1 <- rlogTransformation(dds1, blind=TRUE) # local regression fit was used to transform the data
head(assay(rld1))

# Run the second part of the analysis
dds <- DESeq(dds1)
res<-results(dds)
summary(res)

res<-res[order(res$padj),]

# Plot MA plot with DEG genes highlighted in red
plotMA(res)

# Save the results as a csv file
write.csv(as.data.frame(res), file=paste(fname, ".csv", sep=""))