## Software for implementing methods in ## Smolkin and Ghosh (2002) # Ben-Hur algorithm-uses jaccard measure to aid in determining the true number # of clusters in a dataset # Important Note!!! There are two options when using the "benhur" function # Option 1- obtain separate histograms displaying the distributions of correlation # coefficients for various cluster sizes # Option 2- obtain one plot containing the cumulative distributions of correlation # coefficients for each cluster size overlayed into one plot # If Option 1 is desired, make sure that comment symbols (#) are added to the start of each # line of an 8-line block of code near the end of the function- Below lines 1 and 8 are # presented to clarify the lines in question: # line 1: #par(mfrow=c(1,1)) # line 8: #par(new=FALSE) # If Option 2 is desired, remove comment symbols (#) from these eight lines. # Explanation of Function Parameters- # data- specify name of dataset # freq- proportion of samples to use (should exceed 0.5) # linkmeth- denote which linkage method (ie. average, complete, etc.) # upper- algorithm will be performed for cluster sizes ranging from 2 to the value # specified for this parameter # iterations- # of times to repeat process for each cluster size (100 is reasonable) # seednum- enter starting seed number # Explanation of Output if Option 1 is chosen- # The output of the "benhur" function consists of a grid of histograms. Each # histogram represents the distribution of correlation coefficients a particular # cluster size, k. Histograms are created for all cluster sizes from 2 to the # user-specified value for "upper". The distribution for 2 clusters is found in the # top-left corner. Moving from left to right shows the histogram for the next smallest # size. # Function- benhur<-function(data,freq,linkmeth="average",upper,iterations,seednum) { set.seed(seednum) library(mva) library(cluster) size<-ceiling(((upper-1)/4)) par(mfrow=c(size,4)) samples<-dim(data)[1] n<-dim(data)[2] jaccardmatrix<-NULL for(k in 2:upper) { jaccardvec<-NULL for(m in 1:iterations) { sampleone<-sample(1:samples,(freq*samples),replace=F) sampletwo<-sample(1:samples,(freq*samples),replace=F) subone<-data[sampleone,] subtwo<-data[sampletwo,] clustone<-hclust(dist(subone),method=linkmeth) cutone<-cutree(clustone,k) clusttwo<-hclust(dist(subtwo),method=linkmeth) cuttwo<-cutree(clusttwo,k) # Checking for intersection inboth<-NULL placeone<-NULL placetwo<-NULL for (i in 1:samples) { c<-0 if (length(sampleone[sampleone==i])==1) c<-1 if (length(sampletwo[sampletwo==i])==1) c<-c+1 if (c==2) inboth<-c(inboth,i) } order<-c(1:length(sampleone)) # Determining which individuals were subsetted to both samples for (g in 1 :length(inboth)) { placeone[g]<-cutone[order[sampleone==inboth[g]]] placetwo[g]<-cuttwo[order[sampletwo==inboth[g]]] } # Setting up matrices of 0's and 1's for each sampled group Cone<-matrix(0,length(placeone),length(placeone)) Ctwo<-Cone # Now to fill in some 1's where individuals are in the same group for (y in 1:(length(placeone)-1)) { for (z in (y+1):length(placeone)) { if (placeone[y]==placeone[z]) Cone[y,z]<-1;Cone[z,y]<-1 if (placetwo[y]==placetwo[z]) Ctwo[y,z]<-1;Ctwo[z,y]<-1 } } jaccard<-sum(Cone*Ctwo)/(sum(Cone*Cone)+sum(Ctwo*Ctwo)-sum(Cone*Ctwo)) jaccardvec<-cbind(jaccardvec,jaccard) } # Histogram of jaccards for that value of k hist(jaccardvec,25,freq=TRUE,main=k) # Make a matrix of jaccards-each row contains the 100 sorted jaccards for that k jaccardsort<-sort(jaccardvec) jaccardmatrix<-rbind(jaccardmatrix,jaccardsort) jaccardvec<-NULL # Remove comments on the following 8 lines only if you are interested in obtaining # the overlayed cumulative distributions of correlation coefficients for all cluster # sizes and not separate histograms for each cluster size #par(mfrow=c(1,1)) #seqvec<-seq(0.01,1,by=0.01) #for (mm in 1:dim(jaccardmatrix)[1]) #{ # plot(jaccardmatrix[mm,],seqvec,type="s",cex=0.1,xlim=range(0,1),ylim=range(0,1)) # par(new=TRUE) #} #par(new=FALSE) } } # This program determines the rates at which exact clusters reappear # when sampling is performed using only a proportion of the available # genes. This program will first cluster samples using all of the # available genes. Then, using only a user-determined fraction of the available # genes, clustering will be performed a user-defined number of times. The # final result relates the rates, in percentages, at which original clusters # were exactly replicated out of the total number of clusterings performed. # For example, an original clustering using all available genes may have # led to a cluster comprised of observations 1,4,6,8,14,16. This program will # output the percentage of times that this exact cluster configuration appeared # when only a random subset of the genes were used. A rate of 95% indicates # that 95% of the time, this exact cluster reappeared when only a subset of # the available genes were used. The program provides percentages for each of # the clusters in the original data. # Notes- Parameters # data= name of dataset # m= # of samples (rows) # n= # of genes (columns) # iterations= # of times to perform random clustering excluding a # specified number of genes # randgene= # of genes to include in random clustering # cl= # of clusters # method= clustering linkage method (i.e. average, complete, etc.) # seednum= starting seed number # Output- # Each row of "Clusters" contains a listing of the observations placed into # one of the "cl" clusters when clustering utilized all of the available genes. # Trailing zeros occur after the last observation for that cluster. # "Percentages" is a vector containing elements representing the rates at which # exact clusters reappeared using randomly chosen gene subsets. The position of the # elements corresponds to the rows of "Clusters". For example, the first element of # "Percentages" is the percentage of times that the cluster represented by the first # row of "Clusters" reappeared using random gene-subsets. # "ClusterNumber"- restates number of clusters user specified # "GenesUsed"- restates number of genes to use for random clustering # Function random<-function(data,iterations,randgene,cl,linkmeth="average",seednum) { library(mva) library(cluster) m<-dim(data)[1] n<-dim(data)[2] set.seed(seednum) # Revdata will contain the data after randomingly chosing (#randgene) # genes revdata<-rep(0,times=(m*randgene)) dim(revdata)<-c(m,randgene) # Perform original clustering on an entire dataset # 'Realmatrix' is a matrix with 'cl' rows and with the number of columns equal to # the size of the largest cluster # Each of row of 'Realmatrix' contains the observations that form one of the 'cl' clusters # sorted in numerical order---trailing zeroes are at the ends of rows where that cluster # has fewer observations than does the largest cluster original<-hclust(dist(data),method=linkmeth) orig<-cutree(original,cl) longest<-rep(0,times=cl) for (qq in 1:cl) { longest[qq]<-length(orig[orig==qq]) } colsize<-max(longest) realmatrix<-matrix(0,cl,colsize) ordervec<-seq(1:m) for (aa in 1:cl) { realmatrix[aa,1:length(orig[orig==aa])]<-sort(ordervec[orig==aa]) } # Now to perform random clustering and tabulate occurrences of true clusters # when only a subset of genes are used # 'Revdata' is the name of the dataset containing the subset of genes tally<-rep(0,times=cl) for (kk in 1:iterations) { values<-sample(1:n,size=randgene,replace=F) revdata<-data[,values] randclustering<-hclust(dist(revdata),method=linkmeth) randclust<- cutree(randclustering,cl) for(jj in 1:cl) { if (length(randclust[randclust==jj])>colsize) next for(tt in 1:cl) { randclustvec<-rep(0,times=colsize) sorted<-sort(ordervec[randclust==jj]) randclustvec[1:length(sorted)]<-sorted diff<-realmatrix[tt,]-randclustvec absdiff<-abs(diff) sum<-sum(absdiff) if (sum==0) tally[tt]<-tally[tt]+1 } } } percentages<-tally/iterations result<-list(Clusters=realmatrix, Percentages=percentages, ClustNumber=cl, GenesUsed=randgene) print(result) }