# $Id: rfImputeSmsupvsd.R,v 1.12 2014/07/14 04:50:26 tunenori Exp tunenori $ # Description: # semi-supervised Data Imputation using the proximity from Random Forests. # # Usage: # rfImpute.smsupvsd(x, iter=10) # # Arguments: # x: An semi-supervised data frame or matrix, some containing 'NA's. # Response vector is not needed. # iter: Number of iterations needed to run the imputation. # proximity: Sould proximity measure among the row be calculated? # # Details: # The algorithm starts rough imputation for missing response variable, # next, we replace the predictor running Andy Liaw's `rfImpute', # finally, repeating the replacement of the variables by using the # proximities between cases. # `RandomForest' cannot be allowed 'NA's for response variable. # `fImpute' has not (yet) been implemented for the unsupervised # and semi-supervised case. # # Value: # A data frame or matrix containing the completed data matrix, where # 'NA's are imputed by using the proximity from randomForest. # # See Also: # 'rfImpute', 'na.roughfix' # # Example: # # library(randomForest) # source("./rfImputeUnsupvsd.R") # # data(iris) # iris.na <- iris # set.seed(111) # ## artificially drop some data values. # for (i in 1:5) iris.na[sample(150, sample(20)), i] <- NA # x <- iris.na[,-5] # predictor: `Species' # y <- iris.na[,5] # response variable: `Species' # set.seed(222) # irisImput.smsupvsd <- rfImput.smsupvsd(x, y) # # $Id: rfImputeSmsupvsd.R,v 1.12 2014/07/14 04:50:26 tunenori Exp tunenori $ rfImput.smsupvsd <- function (x, y, iter=1, proximity=F){ if (is.null(ncol(x))){ stop("ncol of x is null: (x) should be matrix or data.frame-matrix") }else if (!is.null(ncol(y))){ stop("ncol of y is not vector: (y) should be vector or data.frame-vector") }else if (nrow(x) != length(y)){ stop("nrow(x) differs from length(y)") }else if (iter <= 0){ stop("iter: zero or negative") } y.missing <- is.na(y) if (sum(y.missing) == 0){ stop("NOT semi-supervised data: no missing data for y") } x.roughfix <- na.roughfix2(x) x.impute <- x ntree <- as.integer( min(300, 300 * 1000 / nrow(x[!y.missing,]))) if (any(is.na(x[!y.missing,]))){ if (proximity){ x.rf <- randomForest( x.roughfix[!y.missing,], y[!y.missing], proximity=proximity, ntree=ntree) x.prox <- x.rf$proximity for (i in 1:ncol(x)){ x.impute[!y.missing,i] <- nafix.prox(x[!y.missing,i], x.roughfix[!y.missing,i], x.prox) } rf.labeled <- cbind(y[!y.missing], x.impute[!y.missing,]) }else{ rf.labeled <- rfImpute(x[!y.missing,], factor(y[!y.missing]), ntree=ntree) # factor is necessary to drop invalid labels x.rf <- randomForest( rf.labeled[,-1], y[!y.missing]) } }else{ x.rf <- randomForest( x[!y.missing,] ,y[!y.missing]) rf.labeled <- cbind(y[!y.missing], x[!y.missing,]) } y.unlabeled <- predict(x.rf, x.roughfix[y.missing,]) if (0){ # imputation of x for unlabeled is not needed ntree <- as.integer( min(300, 300 * 1000 / nrow(x[y.missing,]))) rf.unlabeled <- rfImpute(x[y.missing,], factor(y.unlabeled), ntree=ntree) } x.sav <- x y.sav <- y y.sav[y.missing] <- y.unlabeled if (0){ x.sav[y.missing,] <- rf.unlabeled[,-1] }else{ x.sav[y.missing,] <- x.roughfix[y.missing,] } x.sav[!y.missing,] <- rf.labeled[,-1] n.label=length(y[!y.missing]) n.unlabel=length(y[y.missing]) x.rf.label <<- x.rf return(list(impdata=cbind(y.sav, x.sav), rf.label=x.rf, n.label=n.label, n.unlabel=n.unlabel)) } # Description: # Impute or revise NA elements using the data proximity. # Arguments: # na.vales: data vector that includes NA; unchanged. # rough.vales: rough data vector to be replaced; NAs cannot include. # x.prox: data proximity matrix; each element is positive and <= 1. # y.prob: estimation prob. of the responses(y) #NEW nafix.prox.prob <- function (na.vales, rough.vales, x.prox, y.prob){ if (length(na.vales) != length(rough.vales)) stop("'na.vales' and 'rough.vales' must have the same length") else if (length(rough.vales) != ncol(x.prox)) stop("'rough.vales' and 'x.prox' size incorrect") else if (ncol(x.prox) != length(y.prob)) stop("'x.prox' and 'y.prob' size incorrect") # NA imputation ONLY for NA data na.list <- which(is.na(na.vales)) replaced.vales <- rough.vales for (i in 1:length(na.list)){ j <- na.list[i] x.prox[j,j] <- 0 # ignore the weight of the data to be imputed. replaced.vales[j] <- kWeighted.mean (rough.vales, x.prox[,j] * y.prob); } return(replaced.vales) } # Description: # Faster than na.raghfix(). # References: # https://stat.ethz.ch/pipermail/r-help/2010-July/244390.html na.roughfix2 <- function (object, ...) { res <- lapply(object, roughfix) structure(res, class = "data.frame", row.names = seq_len(nrow(object))) } roughfix <- function(x) { missing <- is.na(x) if (!any(missing)) return(x) if (is.numeric(x)) { x[missing] <- median.default(x[!missing]) } else if (is.factor(x)) { freq <- table(x) x[missing] <- names(freq)[which.max(freq)] } else { stop("na.roughfix only works for numeric or factor") } x }