Hi everyone, on this page you will find the code used to generate the figures in the “Introduction to Bioinformatics” EXMD 635 course lecture at McGill University on October 3rd, 2016. The code is provided for demonstration purposes. Please feel free to consult the blog section of this webpage for more detailed tutorials.
library("affy") # Read the .CEL files and creates an AffyBatch object myData <- ReadAffy(celfile.path="C:/Users/Radia/Desktop/HER2 project desktop/GSE22093_RAW") # Checking the quality of the data before normalization library("arrayQualityMetrics") # For quality metrics prior to normalization arrayQualityMetrics(myData, outdir="quality_results") # Data normalization-------------------------------- # Unormalized data par(mar=c(12,5,2,2)) boxplot(myData[, 1:6], las=2, ylab="Signal Intensity") # Quantile Normalization myData.quantile <- normalize.AffyBatch.quantiles(myData) boxplot(myData.quantile[, 1:6], las=2, ylab="Signal Intensity") # Loess Normalization myData.loess <- normalize.AffyBatch.loess(myData) boxplot(myData.loess[, 1:6], las=2, ylab="Signal Intensity") # RMA normalization-------------------------------- # Creates an eset object from the AffyBatch object eset <- rma(myData) boxplot(exprs(eset), las=2, ylab="Expression Values") # Checking the quality of the data after normalization arrayQualityMetrics(expressionset=eset, outdir="Report_for_NormGSE22093", force=TRUE) #---------------------------------------------------------------- # Add pData to the eset object for downstream analysis setwd("C:/Users/Radia/Desktop/HER2 project desktop") load(file="GSE22093example.RData") # loads eset object # Correct the colnames of the eset object colnames(eset) newNames <- gsub(".CEL.gz", "", colnames(eset)) colnames(eset) <- newNames # Add the phonoData to the eset object phenoData <- readRDS("HER2phenoData.rds") head(phenoData) pData(eset) <- phenoData head(phenoData[, c(3:6, 10:11)]) tail(phenoData[, c(3:6, 10:11)]) saveRDS(eset, "eset.rds") #---------------------------------------------------------------- # pData features for downstream analysis # Simplify the names of our features of interest p53status <- sapply(strsplit(as.character(phenoData$characteristics_ch1.3), ": "), function(x) x[[2]]) ER_mRNA <- sapply(strsplit(as.character(phenoData$characteristics_ch1.4), "): "), function(x) x[[2]]) ER_IHC <- sapply(strsplit(as.character(phenoData$characteristics_ch1.5), ": "), function(x) x[[2]]) grade <- sapply(strsplit(as.character(phenoData$characteristics_ch1.10), ": "), function(x) x[[2]]) qc <- sapply(strsplit(as.character(phenoData$characteristics_ch1.2), ": "), function(x) x[[2]]) # Create a data frame with the information data <- data.frame(p53status, ER_mRNA, ER_IHC, grade, qc) # Adjust the rownames rownames(data) <- rownames(phenoData) # Show the first 20 samples head(data, 20) #---------------------------------------------------------------- # Going from probe ids to gene names for esest eset #Annotation: hgu133a # Load annotation library library("hgu133a.db") probeids <- rownames(exprs(eset)) geneSymbols <- unlist(mget(probeids, hgu133aSYMBOL, ifnotfound=NA)) geneEntrez <- unlist(mget(probeids, hgu133aENTREZID, ifnotfound=NA)) excelData <- data.frame(exprs(eset), geneSymbols, geneEntrez, stringsAsFactors=FALSE) head(excelData) write.csv(excelData, file="HER2normalizedData.csv") #---------------------------------------------------------------- # Filter gene expression data library("genefilter") # Non-specific filtering f1 <- pOverA(0.25, log2(100)) # the intensity of a gene should be above log2(100) in at least 25 percent of the samples f2 <- function(x) (IQR(x) > 0.5) ff <- filterfun(f1, f2) selected <- genefilter(eset, ff) sum(selected) # 5235 probes remain esetSub <- eset[selected, ] #---------------------------------------------------------------- # For Hierarchical Clustering on unfiltered data dat <- exprs(eset) d <- dist(t(dat), method = "euclidean") image(as.matrix(d)) hc <- hclust(d, method = "ward") plot(hc) # plot(hc, labels = ER_mRNA) # Highlight samples by ER status pal <- c("purple", "darkgreen", "gray") grp <- factor(ER_IHC) # Add the grp color coding library(dendextend) dend <- as.dendrogram(hc) labels_colors(dend) <- pal[grp][order.dendrogram(dend)] plot(dend) legend("topright", legend=levels(grp), fill=pal, cex=1.5) #---------------------------------------------------------------- # For Hierarchical Clustering on filtered data dat <- exprs(esetSub) d <- dist(t(dat), method = "euclidean") image(as.matrix(d)) hc <- hclust(d, method = "ward") plot(hc) dend <- as.dendrogram(hc) labels_colors(dend) <- pal[grp][order.dendrogram(dend)] plot(dend) legend("topright", legend=levels(grp), fill=pal, cex=1.5) #---------------------------------------------------------------- # For Hierarchical Clustering on Top 100 most variable genes dat <- exprs(eset) genes.var <- apply(dat, 1, var) genes.var.select <- order(genes.var, decreasing = T)[1:100] dat.s <- dat[genes.var.select, ] d.s <- dist(t(dat.s), method = "euclidean") hc.s <- hclust(d.s, method = "ward") plot(hc.s) dend <- as.dendrogram(hc.s) labels_colors(dend) <- pal[grp][order.dendrogram(dend)] plot(dend) legend("topright", legend=levels(grp), fill=pal, cex=1.5) #---------------------------------------------------------------- # For Hierarchical Clustering on Top 100 most variable genes library("ComplexHeatmap") library("dendextend") dat <- exprs(eset) genes.var <- apply(dat, 1, IQR) genes.var.select <- order(genes.var, decreasing = T)[1:100] dat.s <- dat[genes.var.select, ] d.s <- dist(t(dat.s), method = "euclidean") hc.s <- hclust(d.s, method = "ward") plot(hc.s) dend <- as.dendrogram(hc.s) labels_colors(dend) <- pal[grp][order.dendrogram(dend)] plot(dend) legend("topright", legend=levels(grp), fill=pal, cex=1.5) # Heatmap visualization of Top 100 most variable genes by IQR mat <- dat.s dend = hclust(dist(mat, method = "euclidean"), method="ward.D2") dend = color_branches(dend, k = 3) Heatmap(mat, name = "log2 expression", cluster_rows = dend, row_names_gp=gpar(fontsize=6), column_names_gp=gpar(fontsize=6)) #---------------------------------------------------------------- # Heatmap visualization of Top 100 most variable genes by IQR # with pData information genes.var <- apply(dat, 1, IQR) genes.var.select <- order(genes.var, decreasing = T)[1:100] dat.s <- dat[genes.var.select, ] z <- dat.s mat <- t(scale(t(z))) data <- data.frame(p53status, ER_mRNA, ER_IHC, grade, qc) rownames(data) <- rownames(phenoData) ha = HeatmapAnnotation(df = data[, 1:4], col = list(p53status = c("MUT" = "brown1", "WT" = "black", "n/a" = "gray"), ER_mRNA = c("ERneg" = "black", "ERpos" = "darkorchid1"), ER_IHC= c("ERneg" = "black", "ERpos" = "darkorchid4", "NA" ="white"), grade = c("1" = "darkblue", "2" = "darkmagenta", "3" = "darkgreen", "NA"="white"))) Heatmap(mat, name = "scaled values", cluster_rows = dend, row_names_gp=gpar(fontsize=6), column_names_gp=gpar(fontsize=9), top_annotation=ha) #---------------------------------------------------------------- # Gene and sample distance DATA VISUALISATION library("bioDist") dat <- exprs(eset) genes.var <- apply(dat, 1, IQR) genes.var.select <- order(genes.var, decreasing = T)[1:100] eS <- esetSub[genes.var.select, ] # To get the closest related genes sp = spearman.dist(eS, sample=FALSE) f1 = featureNames(eS)[1] closest.top(f1, sp, 3) # For sample-distance heatmaps library("RColorBrewer") hmcol <- colorRampPalette(brewer.pal(10,"RdBu"))(256) pal <- c("purple", "darkgreen", "gray") grp <- factor(ER_mRNA) small.eset <- exprs(eS) d.sample.euc <- euc(t(small.eset)) heatmap(as.matrix(d.sample.euc),sym=TRUE,col=hmcol, main='Between-sample distances (Euclidean)',labCol=NA, xlab='cell type', cexRow=0.5, ColSideColors=pal[grp]) # For gene-distance heatmaps d.gene.cor <- cor.dist(small.eset) heatmap(as.matrix(d.gene.cor),sym=TRUE,col=hmcol, main='Between-gene distances (Pearson)', xlab='probe set id',labCol=NA, cexRow=0.5) #---------------------------------------------------------------- # Get the gene names for the related genes in the gene-distance plot M1 <- as.matrix(d.gene.cor) head(M1) library("hgu133a.db") probeids <- rownames(M1) geneSymbols <- unlist(mget(probeids, hgu133aSYMBOL, ifnotfound=NA)) getNewNames <- function(x) ifelse(is.na(x), names(x), x) rownames(M1) <- getNewNames(geneSymbols) heatmap(M1,sym=TRUE,col=hmcol, main='Between-gene distances (Pearson)', xlab='probe set id',labCol=NA, cexRow=0.5) #---------------------------------------------------------------- # Get the list of DEG gene betweeen grade3 and other cancers # Run DEG with limma library("limma") # Define the groups for differential expression grade3status <- function(x) ifelse(x=="3", "grade 3", "other") grp <- factor(grade3status(grade), levels=c("other", "grade 3")) # Get filtered expression data M1 <- exprs(esetSub) dim(M1) # 5284 x 103 # define model matrix for lmFit design <- model.matrix(~grp) head(design) # Fit models fit <- lmFit(M1, design) fit <- eBayes(fit) res <- topTable(fit, coef="grpgrade 3", n="Inf", adjust.method="BH") dim(res) head(res) # Add gene names require("hgu133a.db") probeids <- rownames(res) geneSymbols <- unlist(mget(probeids, hgu133aSYMBOL, ifnotfound=NA)) gene_symbols <- getNewNames(geneSymbols) resPrint <- cbind(res, gene_symbols) # Save the table as a txt file write.table(resPrint, "DEG_Her2_Grade3vsOthers.txt", row.names=TRUE, col.names=NA, quote=FALSE, sep="\t") # To just print the probes that were significant by adjusted p value resB <- topTable(fit, coef="grpgrade 3", n="Inf", adjust.method="BH", p.value=0.05) dim(resB) # 2578 x 6 #---------------------------------------------------------------- # Heatmap for the DEG genes between Grade 3 HER2+ tumors and Other types library("gplots") grp <- factor(grade) grp <- relevel(grp, ref="3") pal <- c("darkgreen", "cyan", "darkblue", "gray") # Heatmap with scaled expression values heatmap(M2,sym=FALSE,col=hmcol, ColSideColors=pal[grp], xlab='HER2+ BC Samples', labCol=NA) # or with heatmap.2() function # Print Heatmap with the legend heatmap.2(M2,scale="row",col=hmcol, ColSideColors=pal[grp], xlab='HER2+ BC Samples', labCol=NA, trace="none", key.title="") legend("topright", legend=levels(grp), fill=pal, cex=0.6, title="Grade") #---------------------------------------------------------------- # Example DESeq2 script ### Do not run ### # Get the sample table to run the analysis for all samples -------------------------------------------- sampleTable <- data.frame(sampleName=sampleNames, fileName=sampleFiles, condition=sampleCondition) sampleTable # Get the desq2 data object ddsHTSeq <- DESeqDataSetFromHTSeqCount(sampleTable=sampleTable, directory=directory, design=~condition) # Save a table of the counts data countsTable <- counts(ddsHTSeq) head(countsTable) dds <- DESeq(ddsHTSeq) res <- results(dds) summary(res) # Doing the normalization after filtering genes that are not expressed across the samples dds1 <- dds[rowSums(counts(dds)) > 1, ] rld1 <- rlogTransformation(dds1, blind=TRUE) # local regression fit was used to transform the data head(assay(rld1)) # Run the second part of the analysis dds <- DESeq(dds1) res<-results(dds) summary(res) res<-res[order(res$padj),] # Plot MA plot with DEG genes highlighted in red plotMA(res) # Save the results as a csv file write.csv(as.data.frame(res), file=paste(fname, ".csv", sep=""))