index.H {clusterSim} | R Documentation |
Calculates the Hartigan index
index.H (x,clall,d=NULL,centrotypes="centroids")
x |
data |
clall |
Two vectors of integers (e.g. bound column-wise with cbind) indicating the cluster to which each object is allocated in the partitions of n objects into u and u+1 clusters |
d |
optional distance matrix, used for calculations if centrotypes="medoids" |
centrotypes |
"centroids" or "medoids" |
See file $R_HOME\library\clusterSim\pdf\indexH_details.pdf for further details
Hartigan index
Marek Walesiak marek.walesiak@ue.wroc.pl, Andrzej Dudek andrzej.dudek@ue.wroc.pl
Department of Econometrics and Computer Science, University of Economics, Wroclaw, Poland http://keii.ue.wroc.pl/clusterSim
Hartigan, J. (1975), Clustering algorithms, Wiley, New York.
Milligan, G.W., Cooper, M.C. (1985), An examination of procedures for determining the number of clusters in a data set, "Psychometrika", vol. 50, no. 2, 159-179.
Tibshirani, R., Walther, G., Hastie, T. (2001), Estimating the number of clusters in a data set via the gap statistic, "Journal of the Royal Statistical Society", ser. B, vol. 63, part 2, 411-423.
index.G1, index.G2, index.G3, index.S, index.KL, index.Gap, index.DB
# Example 1
# Hartigan index for one pair of PAM partitions (4 and 5 clusters).
library(clusterSim)
data(data_ratio)
cl1 <- pam(data_ratio, 4)
cl2 <- pam(data_ratio, 5)
# index.H expects the u-cluster and (u+1)-cluster memberships side by side.
clall <- cbind(cl1$clustering, cl2$clustering)
index.H(data_ratio, clall)

# Example 2
# Scan nc = min_nc..max_nc on a complete-linkage hierarchy (Euclidean
# distances, centroid-based index) and report the smallest nc with H <= 10.
library(clusterSim)
data(data_ratio)
md <- dist(data_ratio, method = "euclidean")
# nc - number_of_clusters
min_nc <- 1
max_nc <- 20
# H value at the selected nc (named h_min to avoid masking base::min).
h_min <- 0
# Column 1: number of clusters; column 2: Hartigan index for that number.
res <- array(0, c(max_nc - min_nc + 1, 2))
res[, 1] <- min_nc:max_nc
found <- FALSE
# The dendrogram does not depend on nc, so build it once outside the loop.
hc <- hclust(md, method = "complete")
for (nc in min_nc:max_nc) {
  print(nc)
  cl1 <- cutree(hc, k = nc)
  cl2 <- cutree(hc, k = nc + 1)
  clall <- cbind(cl1, cl2)
  H <- index.H(data_ratio, clall, centrotypes = "centroids")
  res[nc - min_nc + 1, 2] <- H
  # Keep only the first (smallest) nc meeting the rule; `<=` matches the
  # "H<=10" criterion reported in the messages below.
  if ((H <= 10) && (!found)) {
    nc1 <- nc
    h_min <- H
    clopt <- cl1
    found <- TRUE
  }
}
if (found) {
  print(paste("minimal nc for H<=10 equals", nc1, "for H=", h_min))
  print("clustering for minimal nc where H<=10")
  print(clopt)
} else {
  print("Clustering not found with H<=10")
}
# Writes into the current working directory.
write.table(res, file = "H_res.csv", sep = ";", dec = ",",
            row.names = TRUE, col.names = FALSE)
plot(res, type = "p", pch = 0, xlab = "Number of clusters", ylab = "H",
     xaxt = "n")
# Horizontal reference line at the H = 10 threshold.
abline(h = 10, untf = FALSE)
axis(1, c(min_nc:max_nc))

# Example 3
# Same scan as Example 2, but with Manhattan distances and the
# medoid-based index (the distance matrix is passed through `d`).
library(clusterSim)
data(data_ratio)
md <- dist(data_ratio, method = "manhattan")
# nc - number_of_clusters
min_nc <- 1
max_nc <- 20
# H value at the selected nc (named h_min to avoid masking base::min).
h_min <- 0
# Column 1: number of clusters; column 2: Hartigan index for that number.
res <- array(0, c(max_nc - min_nc + 1, 2))
res[, 1] <- min_nc:max_nc
found <- FALSE
# The dendrogram does not depend on nc, so build it once outside the loop.
hc <- hclust(md, method = "complete")
for (nc in min_nc:max_nc) {
  print(nc)
  cl1 <- cutree(hc, k = nc)
  cl2 <- cutree(hc, k = nc + 1)
  clall <- cbind(cl1, cl2)
  H <- index.H(data_ratio, clall, d = md, centrotypes = "medoids")
  res[nc - min_nc + 1, 2] <- H
  # Keep only the first (smallest) nc meeting the rule; `<=` matches the
  # "H<=10" criterion reported in the messages below.
  if ((H <= 10) && (!found)) {
    nc1 <- nc
    h_min <- H
    clopt <- cl1
    found <- TRUE
  }
}
if (found) {
  print(paste("minimal nc for H<=10 equals", nc1, "for H=", h_min))
  print("clustering for minimal nc where H<=10")
  print(clopt)
} else {
  print("Clustering not found with H<=10")
}
# Same file name as in Example 2, so that output is overwritten.
write.table(res, file = "H_res.csv", sep = ";", dec = ",",
            row.names = TRUE, col.names = FALSE)
plot(res, type = "p", pch = 0, xlab = "Number of clusters", ylab = "H",
     xaxt = "n")
# Horizontal reference line at the H = 10 threshold.
abline(h = 10, untf = FALSE)
axis(1, c(min_nc:max_nc))