################################################################################################# # PLEASE READ "DIRECTION FOR USING THE R CODE.PDF" BEFORE USING THE FOLLOWING CODE! # #-----------------------------------------------------------------------------------------------# # File Description: # # The following code performs Fisher's exact test, EASE analysis, Wilcoxin Mann-Whitney test, # # and two-sample Kolmogorov-Smirnov test to identify over-represented functional gene category,# # and performs one-sample Kolmogorov-Smirnov test to identify differentially expressed # # functional category, in a microarray experiment. The code also returns the corresponding # # ECDF and ROC curve plots. # #-----------------------------------------------------------------------------------------------# # Programmer: Hua Liu # # Department of Statistics, University of Kentucky, Lexington, KY 40506 # # hualiu@ms.uky.edu # # Date: September 20, 2005 # ################################################################################################# ################################################################################################# # (1). Set up the working directory: # ################################################################################################# setwd("type your directory here") # for example: setwd("c:/fca") ################################################################################################ # (2). Import data: # ################################################################################################ population=read.csv("type the name of your population data here.csv",header = FALSE) # for example: population=read.csv("population.csv",header = FALSE) fc=as.matrix(read.csv("type the name of your functional gene category data here.csv",header = FALSE)) # for example: fc=as.matrix(read.csv("apoptosis.csv",header = FALSE)) ################################################################################################# # (3). Set up the alpha level: # ################################################################################################# alpha=0.05 ################################################################################################ # (4). Functions that perform functional category analyses and draw plots: # # Function "split" seperates input p-values in the population data set according to # # whether a probeset is in the functional gene category or not; function "fca" returns # # p-values from Fisher's exact test, Wilcoxin Mann-Whitney test, two-sample and one-sample# # sample Kolmogorov-Smirnov tests, and EASE score; functions "ecdf.plot", "ecdf.part", # # and "roc.plot" provide ECDF plot, emlargement of the ECDF plot, and ROC curve plot, # # respectively. # ################################################################################################ # below is a 2x2 table used by Fisher's exact test, where the significance is definded as if # the p-value obtained from the test for differential gene expression is less than or equal to # a pre-specified alpha level. # ----------------------------------------------------------------- # | in the category not in the category | # | --------------------------------------------| # | significant | n.fc.s n.other.s | # | non-significant | n.fc.ns n.other.ns | # ----------------------------------------------------------------- split=function(population,fc) { # add a column of "0" to the file "population" pop1=cbind(population,0) # if the probeset is in the functional category, replace replace 0 with 1 for (i in 1:nrow(fc)){ pop1[pop1[,1]==fc[i,],3]=1 } return(pop1) } fca=function(fc.p,other.p) { # alpha defines significance level for 2x2 table # count number of genes in each of the 4 cells in 2x2 table: n.fc.s=length(fc.p[fc.p<=alpha]) n.fc.ns=length(fc.p[fc.p>alpha]) n.other.s=length(other.p[other.p<=alpha]) n.other.ns=length(other.p[other.p>alpha]) # Fisher's exact test p-value: mat.0=matrix(c(n.fc.s,n.fc.ns,n.other.s,n.other.ns),nr=2,dimnames=list(c("s","ns"),c("fc","other"))) Fisher.pvalue=fisher.test(mat.0,alternative="greater")$p.value # Jackknife the category and perform corresponding Fisher's exact tests: mat.1=matrix(c(n.fc.s,n.fc.ns-1,n.other.s,n.other.ns),nr=2,dimnames=list(c("s","ns"),c("fc","other"))) fc.ns.pvalue=fisher.test(mat.1,alternative="greater")$p.value mat.2=matrix(c(n.fc.s-1,n.fc.ns,n.other.s,n.other.ns),nr=2,dimnames=list(c("s","ns"),c("fc","other"))) fc.s.pvalue=fisher.test(mat.2,alternative="greater")$p.value # EASE score based on p-value cut-off at alpha: EASE.score=max(c(Fisher.pvalue,fc.s.pvalue,fc.ns.pvalue)) # Wilcoxon Mann-Whitney test: WMW.pvalue=wilcox.test(fc.p, other.p, alternative = "less")$p.value # two sample Kolmogorov-Smirnov test: KS2.pvalue=ks.test(fc.p, other.p, alternative = "gr")$p.value # one sample Kolmogorov-Smirnov test: KS1.pvalue=ks.test(fc.p, "punif", 0, 1, alternative = "gr")$p.value pvals=rbind(EASE.score,Fisher.pvalue,WMW.pvalue,KS2.pvalue,KS1.pvalue) return(pvals) } ecdf.plot=function (fc,other) { plot(ecdf(other),verticals=TRUE,main=NULL,do.p=FALSE,xlab="alpha",ylab="ECDF",xlim=c(.0345,0.963), ylim=c(0.03655,0.96295),lwd=2) lines(c(0,1), c(0,1),col="blue",lwd=2) lines(c(0,sort(fc),1),seq(0,1,1/length(c(0,sort(fc)))),col="magenta",lwd=2,type="s") b=max(sum(fc<=.053)/length(fc),sum(other<=.053)/length(other))+0.03 lines(c(0.053,0.053),c(0,b),col="black",lwd=1,lty=2) lines(c(0,0.053),c(b,b),col="black",lwd=1,lty=2) lines(c(0,1), c(0,0),col="black",lwd=1) lines(c(0,1), c(1,1),col="black",lwd=1) } ecdf.part=function (fc,other) { b=max(sum(fc<=.053)/length(fc),sum(other<=.053)/length(other))+0.02 plot(ecdf(other),verticals=TRUE,main=NULL,do.p=FALSE,xlab="alpha",ylab="ECDF",xlim=c(0.001,0.052),ylim=c(0,b),lwd=2,cex=1.2) lines(c(0,1), c(0,1),col="blue",lwd=2) lines(c(0,sort(fc),1),seq(0,1,1/length(c(0,sort(fc)))),col="magenta",lwd=2,type="s") max05=max(sum(fc<=.05)/length(fc),sum(other<=.05)/length(other)) min05=min(sum(fc<=.05)/length(fc),sum(other<=.05)/length(other)) fc05=sum(fc<=.05)/length(fc) max01=max(sum(fc<=.01)/length(fc),sum(other<=.01)/length(other)) min01=min(sum(fc<=.01)/length(fc),sum(other<=.01)/length(other)) fc01=sum(fc<=.01)/length(fc) max001=max(sum(fc<=.001)/length(fc),sum(other<=.001)/length(other)) min001=min(sum(fc<=.001)/length(fc),sum(other<=.001)/length(other)) fc001=sum(fc<=.001)/length(fc) lines(c(.05,.05),c(min05,fc05),col="green",lwd=2) lines(rep(.05,length(seq(0,min05,min05/10))),seq(0,min05,min05/10),col="green",lwd=1,lty=2) lines(c(.01,.01),c(min01,fc01),col="green",lwd=2) lines(rep(.01,length(seq(0,min01,min01/10))),seq(0,min01,min01/10),col="green",lwd=1,lty=2) lines(c(.001,.001),c(min001,fc001),col="green",lwd=2) lines(rep(.001,length(seq(0,min001,min001/10))),seq(0,min001,min001/10),col="green",lwd=1,lty=2) legend(0, b, "enlargement",bty="n",cex=1) } roc.plot=function(fc,other) { fc=sort(fc) other=sort(other) alpha=seq(1/length(other),1,1/length(other)) fginv=rep(0,length(other)) for (i in 1:length(other)) { fginv[i]=length(fc[fc<=other[i]])/length(fc) } plot(alpha,fginv,type="l",col="red",xlab="alpha",ylab="ROC",xlim=c(.036,0.963),ylim=c(0.03655,0.96295),lwd=2) lines(c(0,1), c(0,1),col="blue",lwd=2) } ################################################################################################ # (5). Use the functions in (3) to perform functional category analysis and draw plots, # # Four files will be exported to the directory specified under (1), including three plots,# # and a csv file contains p-values of Fisher's exact test, Wilcoxon Mann-Whitney test, # # two-sample and one-sample Kolmogorov-Smirnov tests, and EASE score. # ################################################################################################ pop1=split(population,fc) fc.p=pop1[pop1[,3]==1,2] other.p=pop1[pop1[,3]==0,2] results=fca(fc.p,other.p) write.table(results,file="FCA-pvals.csv",sep=",",col.names="") jpeg("FCA-ECDF plot.jpg") ecdf.plot(fc.p,other.p) dev.off() jpeg("FCA-ECDF plot enlargement.jpg") ecdf.part(fc.p,other.p) dev.off() jpeg("FCA-ROC curve plot.jpg") roc.plot(fc.p,other.p) dev.off()