# $Id: xmeans.prog,v 1.23 2012/04/12 11:21:12 tunenori Exp tunenori $
#
# X-MEANS Clustering
#
# Description:
#
#     Perform x-means non-hierarchical clustering on a data matrix.
#
# Usage:
#
#     xmeans(x, ik = 2, iter.max = 10, pr.proc = F,
#         ignore.covar = T, merge.cls = F)
#
# Arguments:
#
#     x: A numeric matrix of data, or an object that can be coerced to
#         such a matrix (such as a numeric vector or a data frame with
#         all numeric columns).
#
#     ik: The initial number of clusters passed to kmeans().
#         As xmeans calls kmeans recursively, `ik' should be
#         sufficiently small.
#
#     iter.max: The maximum number of iterations allowed.
#
#     pr.proc: logical: if 'TRUE', the processing status is printed.
#
#     ignore.covar: logical: if 'TRUE', covariances of the cluster data
#         are ignored. To save time, 'TRUE' is the default.
#
#     merge.cls: logical: if 'TRUE', some clusters may be merged into
#         other clusters after the iterative division.
#
# Value:
#
#     An object of class 'xmeans', which is a list with components:
#
#     cluster: A vector of integers indicating the cluster to which each
#         point is allocated.
#
#     centers: A matrix of cluster centres.
#
#     size: The number of points in each cluster. When `merge.cls' is TRUE,
#         some elements may be zero.
#
# References:
#
#     Ishioka, T. (2005): ``An Expansion of X-means for Automatically
#     Determining the Optimal Number of Clusters,'' The Fourth IASTED
#     International Conference on Computational Intelligence (CI 2005),
#     Calgary, Canada, July 4-6, pp. 91-96.
#     http://www.rd.dnc.ac.jp/%7Etunenori/doc/487-053.pdf
#
#     Ishioka, T. (2000): ``Extended K-means with an Efficient Estimation
#     of the Number of Clusters,'' Intelligent Data Engineering and
#     Automated Learning --- IDEAL 2000, Second International Conference,
#     Shatin, N.T., Hong Kong, China, December 2000, proceedings, pp. 17-22.
#     (Lecture Notes in Computer Science 1983, Kwong Sak Leung, Lai-Wan
#     Chan, Helen Meng (Eds.), Springer, 2000)
#     http://www.rd.dnc.ac.jp/%7Etunenori/doc/xmeans_ideal2000.pdf
#
# Examples:
#
#     xmeans(iris[,-5], merge.cls=T)
#     plot(cmdscale(dist(iris[,-5])), cex=2, pch=as.numeric(iris[,5]))
#

xmeans <- function(x, ik = 2, iter.max = 10, pr.proc = F,
        ignore.covar = T, merge.cls = F){
    if (ik < 2) ik <- 2
    x <- as.matrix(x)
    p <- ncol(x)    # p-dimensional multivariate
    if (ignore.covar){
        q <- 2 * p    # number of parameters; mean and var for each "p"
    }else{
        q <- p * (p+3) / 2    # integer
    }
    cl <- kmeans(x, ik, iter.max)
    cl.sub <- list()
    for (i in 1:ik){    # for each cluster
        y.ok <- (cl$cluster == i)    # i-th cluster or not
        yi <- matrix(x[y.ok], ncol=p)    # extract the i-th cluster
        zi <- yi    # save the data for graphics
        yi.centers <- cl$centers[i,]
        zi.centers <- yi.centers
        yi.cluster <- cl$cluster[(cl$cluster == i)]
        yi.cluster <- rep(1, length(yi.cluster))
            # sub-cluster numbers should begin from 1
        k1 <- 1    # cluster number
        k2 <- k1 + 1
        bic.prior <- NULL
        stack <- list()    # divided but unprocessed data are stacked here
        lnL0 <- lnL(yi, yi.centers, ignore.covar)
        yi.lnL <- lnL0$lnL
        yi.detVx <- lnL0$detVx

        repeat{    # go through at least once; y$subcluster exists...
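            # Descriptive note (added): each pass tries to split the current
            # cluster "yi" into two sub-clusters via split2cls(); if the
            # two-cluster BIC beats the single-cluster BIC, one half is
            # split again immediately and the other half is pushed onto
            # "stack" for later. When a split is rejected, a stacked
            # cluster is popped and processed; the loop ends when no node
            # and no stacked data remain.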
            if (pr.proc) cat(paste("k1 =", k1, ", k2 =", k2, "\n"))
            if (nrow(yi) == 1){    # sample size is 1
                break
            }
            y <- split2cls(yi, yi.centers, q, bic.prior,
                lnL.prior, detVx.prior, iter.max, ignore.covar)
            if (y$continue){    # splitting continues
                yi.cluster <- updtCrusterNum(y$continue, yi.cluster,
                    k1, k2, y$subcluster)
                zi.centers <- updtCenters(y$continue, zi.centers,
                    k1, k2, y$centers)
                yi.lnL <- updtlnL(y$continue, yi.lnL,
                    k1, k2, y$lnL.post)
                yi.detVx <- updtdetVx(y$continue, yi.detVx,
                    k1, k2, y$detVx.post)
            }
            if (pr.proc) print(y$subcluster)
            if (pr.proc){
                print(y$bic.prior)
                print(y$bic.post)
                # print(y$lnL.prior)    # for debug
                # print(y$lnL.post)    # for debug
                print(y$continue)
            }
            # cat("zi.centers=\n")    # for debug
            # print(zi.centers)    # for debug

            if (!y$continue){    # no node
                if ((nstack <- length(stack))){    # there are stacked data
                    # extract the stacked data
                    if (pr.proc)
                        cat(paste("extract the stacked data (", nstack, ")...\n"))
                    yi <- stack[[nstack]]$data
                    yi.centers <- stack[[nstack]]$centers
                    bic.prior <- stack[[nstack]]$bic
                    lnL.prior <- stack[[nstack]]$lnL
                    detVx.prior <- stack[[nstack]]$detVx
                    k1 <- stack[[nstack]]$cls
                    k2 <- k2    # unchanged
                    # delete the extracted data set
                    if (nstack > 1){
                        stack <- stack[1:(nstack-1)]
                    }else{
                        stack <- list()    # no stacked data
                    }
                    next
                }
                # no node and no stack
                if (pr.proc) cat("no node and no stack...\n")
                break
            }

            # splitting continues...
            y1 <- y$clj1    # data
            y2 <- y$clj2
            yi.ctr1 <- y$centers[1,]    # centers
            yi.ctr2 <- y$centers[2,]
            bic.prior1 <- y$bic.post[1]    # bic
            bic.prior2 <- y$bic.post[2]
            lnL.prior1 <- y$lnL.post[1]    # lnL
            lnL.prior2 <- y$lnL.post[2]
            detVx.prior1 <- y$detVx.post[1]    # detVx
            detVx.prior2 <- y$detVx.post[2]

            # one half is split again recursively...
            yi <- y1
            yi.centers <- yi.ctr1
            bic.prior <- bic.prior1
            lnL.prior <- lnL.prior1
            detVx.prior <- detVx.prior1

            # the other half is stacked...
            if (pr.proc) cat("stacking ...\n")
            stack <- c(stack, list(list(data=y2, centers=yi.ctr2,
                bic=bic.prior2, lnL=lnL.prior2,
                detVx=detVx.prior2, cls=k2)))
            # increment the cluster number
            k2 <- k2 + 1
        }    # end of repeat

        # splitting done ...
        if (pr.proc){
            cat("splitting done...\n")
            cat(paste("main cluster =", i, "*******\n"))
        }
        cl.sub <- c(cl.sub, list(list(cluster = yi.cluster,
            centers = zi.centers, lnL = yi.lnL,
            detVx = yi.detVx, size = tabulate(yi.cluster))))
        if (pr.proc){
            print(cl.sub[[i]])
            plot(zi, col=yi.cluster)
            if (is.vector(zi.centers))
                points(zi.centers[1], zi.centers[2], pch=8)
            else    # matrix
                points(zi.centers, col=1:(length(zi.centers)/p), pch=8)
        }
    }
    if (pr.proc) print(cl.sub)

    xcl <- mergeResult(cl, cl.sub, ik)

    if (merge.cls == F) {
        return(list(cluster = xcl$cluster, centers = xcl$centers,
            size = xcl$size))
    }

    # merge after the progressive dividing
    #
    if (pr.proc) cat("merging after progressive dividing ...\n")
    k <- length(xcl$size)    # final cluster number
    if (k <= 2){    # the minimum cluster number should be 2
        if (pr.proc) cat("merging skipped ...\n")
        return(list(cluster = xcl$cluster, centers = xcl$centers,
            size = xcl$size))
    }
    if (pr.proc){
        cat("xcl$detVx=")
        print(xcl$detVx)
        cat("xcl$size=")
        print(xcl$size)
    }
    klist <- sort.list(xcl$size)    # "small" to "large" order of xcl$size
    if (pr.proc) print(klist)
    for (i in 1:(k-1)){
        for (j in (i+1):k){
            k1 = klist[i]
            k2 = klist[j]
            if (pr.proc)
                cat(paste("inspecting the clusters", k1, "and", k2, "\n"))
            z <- mergedBIC(x, xcl, k1, k2, q, ignore.covar, pr.proc)
            if (z$ret == F){    # k1 or k2 has already been merged.
                # skip this loop
                if (pr.proc) cat("skipping... k1=", k1, "k2=", k2, "\n")
                next
            }
            if (z$bicdiv > z$bicmgd){
                # we prefer the merged model.
                # replace the larger cls. number with the smaller cls. number
                if (pr.proc) cat("replace cls.", k2, "to", k1, "\n")
                xcl$cluster <- replace(xcl$cluster, (xcl$cluster == k2), k1)
                xcl$size[k1] <- xcl$size[k1] + xcl$size[k2]
                xcl$size[k2] <- 0
                xcl$lnL[k1] <- z$lnLmgd
                xcl$lnL[k2] <- 0
                xcl$detVx[k1] <- z$detVxmgd
                xcl$detVx[k2] <- 0
                xcl$centers[k1,] <- z$ctrmgd
                xcl$centers[k2,] <- 0
            }
        }
    }
    list(cluster = xcl$cluster, centers = xcl$centers, size = xcl$size)
}


# merge the results of the sub-clustering;
# the cluster numbers assigned by the first kmeans are renumbered;
# the other centers and sizes are simply appended.
# cl: the result of the first kmeans
# cl.sub: the results of the sub-clustering
# ik: the cluster number given to kmeans.
mergeResult <- function(cl, cl.sub, ik){
    cluster <- cl$cluster    # main cluster
    centers <- NULL
    size <- NULL
    lnL <- NULL
    detVx <- NULL
    k <- 0
    # number of unique clusters; k is then decremented down to 0.
    for (i in 1:ik)
        k <- k + length(cl.sub[[i]]$size)
    kk <- k
    for (i in ik:1){    # loop over the main clusters obtained by kmeans
        xsub <- cl.sub[[i]]$cluster
        iki <- ik - i + 1
        centers <- rbind(centers, cl.sub[[iki]]$centers)
        size <- c(size, cl.sub[[iki]]$size)
        lnL <- c(lnL, cl.sub[[iki]]$lnL)
        detVx <- c(detVx, cl.sub[[iki]]$detVx)
        for (j in length(cl.sub[[i]]$size):1){    # loop over the subclusters
            xsub <- replace(xsub, (xsub == j), k)
            k <- k - 1
        }
        cluster <- replace(cluster, (cluster == i), xsub)
    }
    if (k != 0) stop("mergeResult: assertion failed (k should be 0)...")
    dimnames(centers) <- list(1:kk, NULL)
    list(cluster = cluster, centers = centers, lnL = lnL,
        detVx = detVx, size = size)
}


# update the cluster numbers using the result of "split2cls()"
# continue: if FALSE, no splitting was done
# v: cluster number vector of the initial cluster.
# k1: cluster number to be updated; "k1" becomes "k1" and "k2"
# xsub: sub-cluster number vector for the elements of "v" whose value
#     is "k1"; the given "xsub" contains 1 or 2.
updtCrusterNum <- function(continue, v, k1, k2, xsub){
    if (!is.vector(v)) return(xsub)
    if (!continue) return(v)
    if (k1 == k2) stop("updtCrusterNum() : k1 and k2 should differ.")
    # the commented-out code below is the same algorithm;
    # an explicit element-wise loop is slow in R.
    # j <- 1
    # for (i in 1:length(v)){
    #     if (v[i] == k1){
    #         if (xsub[j] == 2)
    #             v[i] <- k2
    #         j <- j + 1
    #     }
    # }
    # end of algorithm
    xsub <- replace(xsub, (xsub == 2), k2)    # changed
    xsub <- replace(xsub, (xsub == 1), k1)    # unchanged
    v <- replace(v, (v == k1), xsub)
}


# update the cluster centers using the result of "split2cls()"
# continue: if FALSE, no update
# org.centers: original centers matrix
# divided.centers: divided centers matrix; it has 2 rows.
updtCenters <- function(continue, org.centers, k1, k2, divided.centers){
    if (!is.matrix(org.centers)) return(divided.centers)
    if (!continue) return(org.centers)
    if (k1 == k2) stop("updtCenters() : k1 and k2 should differ.")
    z <- NULL
    for (i in 1:max(k2, nrow(org.centers))){
        if (i == k1)
            z <- rbind(z, divided.centers[1,])
        else if (i == k2)
            z <- rbind(z, divided.centers[2,])
        else
            z <- rbind(z, org.centers[i,])
    }
    z
}


# update the lnL using the result of "split2cls()"
# continue: if FALSE, no update
# org.lnL: original lnL vector
# divided.lnL: divided lnL vector having 2 elements.
updtlnL <- function(continue, org.lnL, k1, k2, divided.lnL){
    if (!is.vector(org.lnL)) return(divided.lnL)
    if (!continue) return(org.lnL)
    if (k1 == k2) stop("updtlnL() : k1 and k2 should differ.")
    z <- NULL
    for (i in 1:max(k2, length(org.lnL))){
        if (i == k1)
            z <- c(z, divided.lnL[1])
        else if (i == k2)
            z <- c(z, divided.lnL[2])
        else
            z <- c(z, org.lnL[i])
    }
    z
}


# update the detVx using the result of "split2cls()"
# continue: if FALSE, no update
# org.detVx: original detVx vector
# divided.detVx: divided detVx vector having 2 elements.
updtdetVx <- function(continue, org.detVx, k1, k2, divided.detVx){
    if (!is.vector(org.detVx)) return(divided.detVx)
    if (!continue) return(org.detVx)
    if (k1 == k2) stop("updtdetVx() : k1 and k2 should differ.")
    z <- NULL
    for (i in 1:max(k2, length(org.detVx))){
        if (i == k1)
            z <- c(z, divided.detVx[1])
        else if (i == k2)
            z <- c(z, divided.detVx[2])
        else
            z <- c(z, org.detVx[i])
    }
    z
}


# split into 2 clusters if BIC prefers the split
# q: the number of parameters
# bic.prior: BIC given x; if bic.prior=NULL then it is calculated here
# lnL.prior: lnL given x; if bic.prior=NULL then it is calculated here
# detVx.prior: detVx given x; if bic.prior=NULL then it is calculated here
split2cls <- function(x, centers, q, bic.prior, lnL.prior, detVx.prior,
        iter.max, ignore.covar){
    if (is.null(bic.prior)){
        pb <- priorBIC(x, centers, q, ignore.covar)
        bic.prior <- pb$bic
        lnL.prior <- pb$lnL
        detVx.prior <- pb$detVx
    }
    bic.post <- postBICs(x, centers, q, iter.max, ignore.covar)
    subcluster <- bic.post$clsub$cluster
    #
    # compare to decide whether we should split
    if (is.na(bic.post$bic[3])){
        # BIC may be NA because of too few data
        continue <- FALSE
    }else if (bic.post$bic[3] < bic.prior){    # splitting ...
        # the cluster numbers in cl$cluster will be replaced
        continue <- TRUE
    }else{    # not splitting...
        # return "subcluster" stored in k1
        continue <- FALSE
    }
    # note that "subcluster" gives 1 or 2
    list(continue = continue, subcluster = subcluster,
        bic.prior = bic.prior, bic.post = bic.post$bic,
        lnL.prior = lnL.prior, lnL.post = bic.post$lnL,
        detVx.prior = detVx.prior, detVx.post = bic.post$detVx,
        centers = bic.post$clsub$centers,
        clj1 = bic.post$clj1, clj2 = bic.post$clj2)
}


# return the BIC (prior BIC)
priorBIC <- function(x, centers, q, ignore.covar){
    lnL0 <- lnL(x, centers, ignore.covar)
    bic <- -2 * lnL0$lnL + q * log(nrow(x))    # BIC
    # bic <- -2 * lnL0$lnL + q    # AIC
    list(lnL = lnL0$lnL, detVx = lnL0$detVx, bic = bic)
}


# return the BICs (two posterior BICs)
postBICs <- function(x, centers, q, iter.max, ignore.covar){
    #
    # split into 2 clusters
    clsub <- kmeans(x, 2, iter.max)
    y.ok1 <- lapply(clsub$cluster, "==", 1)    # in the 1st sub-cluster or not
    y.ok2 <- lapply(clsub$cluster, "==", 2)    # in the 2nd sub-cluster or not
    # extract sub data
    p <- ncol(x)
    clj1 <- matrix(x[as.logical(y.ok1)], ncol=p)
    clj2 <- matrix(x[as.logical(y.ok2)], ncol=p)
    # ratio for the pdf
    r1 <- clsub$size[1] / sum(clsub$size)    # [0,1]
    r2 <- 1 - r1    # [0,1]
    # two posterior BICs
    # print(clsub$centers[1,])    # for debug
    # print(apply(clj1, 2, mean))    # for debug
    # print(sqrt(apply(clj1, 2, var)))    # for debug
    # print(r1)    # for debug
    lnL1 <- lnL(clj1, clsub$centers[1,], ignore.covar)
    # print(clsub$centers[2,])    # for debug
    # print(apply(clj2, 2, mean))    # for debug
    # print(sqrt(apply(clj2, 2, var)))    # for debug
    # print(r2)    # for debug
    lnL2 <- lnL(clj2, clsub$centers[2,], ignore.covar)
    n1 <- nrow(clj1)
    n2 <- nrow(clj2)
    # normalizing factor; dist() is in library(mva)
    if (is.na(lnL1$detVx) || is.na(lnL2$detVx))
        beta <- 0
    else
        beta <- dist(clsub$centers) / (sqrt(lnL1$detVx + lnL2$detVx))
    alpha <- 0.5 / pnorm(beta)
    BIC1 <- -2 * lnL1$lnL + q * log(n1)
    BIC2 <- -2 * lnL2$lnL + q * log(n2)
    # BIC1 <- -2 * lnL1$lnL + q    # AIC
    # BIC2 <- -2 * lnL2$lnL + q    # AIC
    # cat(paste("alpha =", alpha, "\n"))    # for debug
    # cat(paste("beta =", beta, "\n"))    # for debug
    # note that BIC is not (BIC1 + BIC2)
    BIC <- -2 * lnL1$lnL - 2 * lnL2$lnL + 2 * q * log(n1 + n2) -
        2 * (n1 + n2) * log(alpha)
    # BIC <- -2 * lnL1$lnL - 2 * lnL2$lnL + 2 * q -
    #     2 * (n1 + n2) * log(alpha)    # AIC
    list(bic = c(BIC1, BIC2, BIC), lnL = c(lnL1$lnL, lnL2$lnL),
        detVx = c(lnL1$detVx, lnL2$detVx),
        clsub = clsub, clj1 = clj1, clj2 = clj2)
}


# return the BICs for the two-clusters-merged model and the divided-clusters model
# k1/k2: IDs of the clusters to be merged
mergedBIC <- function(x, xcl, k1, k2, q, ignore.covar, pr.proc){
    # sample sizes
    # check the input data
    n1 <- xcl$size[k1]
    n2 <- xcl$size[k2]
    if (n1 == 0 || n2 == 0){
        # the cluster has already been merged
        cat(paste("already had been merged\n"))
        ret <- F
        return(list(ret = ret))
    }
    if (is.null(xcl$lnL[k1]) || is.null(xcl$lnL[k2])){
        # lnL may be null because of too few data
        cat(paste("lnL may be null because of few data\n"))
        ret <- F
        return(list(ret = ret))
    }

    # divided-clusters model
    lnL1 = xcl$lnL[k1]
    lnL2 = xcl$lnL[k2]
    ctrextrt <- rbind(xcl$centers[k1,], xcl$centers[k2,])
    beta <- dist(ctrextrt) / (sqrt(xcl$detVx[k1] + xcl$detVx[k2]))
    if (pr.proc) cat(paste("beta=", round(beta, digits=2), "\n"))
    # if (beta > 10){
    #     # the 2 clusters are far apart
    #     ret <- F
    #     return(list(ret = ret))
    # }
    alpha <- 0.5 / as.numeric(pnorm(beta))
    bicdiv <- -2 * lnL1 - 2 * lnL2 + 2 * q * log(n1 + n2) -
        2 * (n1 + n2) * log(alpha)
    # bicdiv <- -2 * lnL1 - 2 * lnL2 + 2 * q -
    #     2 * (n1 + n2) * log(alpha)    # AIC

    # extract the data of the 2 clusters
    y.ok1 <- lapply(xcl$cluster, "==", k1)    # in the 1st cluster or not
    y.ok2 <- lapply(xcl$cluster, "==", k2)    # in the 2nd cluster or not
    # extract sub data
    p = ncol(x)
    clj1 <- matrix(x[as.logical(y.ok1)], ncol=p)
    clj2 <- matrix(x[as.logical(y.ok2)], ncol=p)
    xmgd <- rbind(clj1, clj2)

    # merged cluster center
    ctrmgd <- (n1 * xcl$centers[k1,] + n2 * xcl$centers[k2,]) / (n1 + n2)
    lnLmgd <- lnL(xmgd, ctrmgd, ignore.covar)
    bicmgd <- -2 * lnLmgd$lnL + q * log(nrow(xmgd))    # BIC
    # bicmgd <- -2 * lnLmgd$lnL + q    # AIC
    ret <- T
    list(ret = ret, ctrmgd = ctrmgd, lnLmgd = lnLmgd$lnL,
        detVxmgd = lnLmgd$detVx, bicmgd = bicmgd, bicdiv = bicdiv)
}


# log-likelihood under the assumption of a
# p-dimensional multivariate normal distribution.
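# Added note on what lnL() below computes: for an n-by-p data matrix x with
# center mu and variance-covariance matrix Vx, the returned value is
#     lnL = n * ( -(p/2) * log(2*pi) - log(det(Vx)) / 2 )
#           - (1/2) * sum_i  t(x[i,] - mu) %*% solve(Vx) %*% (x[i,] - mu)
# where Vx is treated as diagonal when ignore.covar=TRUE.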
# ignore.covar: ignore the covariances
lnL <- function(x, centers, ignore.covar=T){
    x <- as.matrix(x)
    p <- ncol(x)    # p-dimensional multivariate
    n <- nrow(x)    # sample size
    if (missing(centers))
        stop("centers must be a number or a matrix")
    if (n <= 2)    # too few data
        return(list(lnL=NA, detVx=NA))
    vx <- var(x)    # variance-covariance matrix
    # print(x)    # for debug
    if (p == 1){    # x is a vector
        invVx <- 1 / as.vector(vx)
        detVx <- as.vector(vx)
    }else{
        if (ignore.covar){
            invVx <- diag(1/diag(vx))    # inverse matrix, assuming a diagonal vx
            detVx <- prod(diag(vx))    # determinant, assuming a diagonal vx
        }else{
            invVx <- solve(vx)    # inverse matrix of "vx"
            y <- chol(vx)    # Cholesky decomposition
            detVx <- prod(diag(y))^2
                # vx = t(y) %*% y, where y is triangular,
                # so det(vx) = det(t(y)) * det(y) = prod(diag(y))^2
        }
    }
    t1 <- -p/2 * 1.837877066    # 1.837... = log(2 * 3.1415...)
    t2 <- -log(detVx) / 2
    xmu <- t(apply(x, 1, "-", centers))
    # print(centers)    # for debug
    # print(xmu)    # for debug
    # s <- 0
    # for (i in 1:n)
    #     s <- s + t(xmu[i,]) %*% invVx %*% xmu[i,]
    if (p == 1){
        s <- sum(xmu^2 * invVx)
    }else{
        s <- sum(apply(xmu, 1, txInvVxX, invVx=invVx))
    }
    t3 <- -s / 2
    ll <- (t1 + t2) * n + as.numeric(t3)    # log-likelihood
    list(lnL=ll, detVx=detVx)
}


# function for calculating
# t(xmu[i,]) %*% invVx %*% xmu[i,]
txInvVxX <- function(x, invVx){
    t(x) %*% invVx %*% x
}
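

# A minimal usage sketch (illustrative only, not part of the original
# program): it assumes this file has been source()'d into an R session and
# mirrors the Examples section in the header. The object names "xm" and
# "xy" below are arbitrary.
#
# xm <- xmeans(iris[, -5], merge.cls = TRUE)
# xm$size                              # points per cluster; zeros mark
#                                      # clusters absorbed by the merging step
# xy <- cmdscale(dist(iris[, -5]))     # 2-D projection for a visual check
# plot(xy, col = xm$cluster, pch = 19) # colour the points by assigned cluster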