ReplicaX
From Opasnet
Jump to navigation
Jump to search
Moderator:Jouni (see all) |
|
Upload data
|
Question
How to generate an artificial data from a real data in such a way that
- individuals are no longer identifiable from the data,
- the marginal distributions are close to those of the original data,
- the correlation structure is close to that of the original data?
Answer
Rcode ReplicaX by Juha Karvanen does exactly that.
⇤--arg1500: . Table data cannot be pasted to the code for some reason. --Jouni (talk) 11:33, 18 November 2018 (UTC) (type: truth; paradigms: science: attack)
# name:dat|description:Copy your data table here|type:table| Causes an error. library(OpasnetUtils) library(Lmoments) library(bayesm) library(MASS) library(plyr) library(compiler) objects.latest("Op_en6248", code_name = "initiate") # Fetch the ReplicaX functions enableJIT(3) set.seed(20131103) #to reproduce the results #Generating replica data for(i in (1:ncol(dat))[vartype == 'continuous']) { colnames(dat)[i] <- levels(dat[[i]])[dat[[1 , i]]] dat[[i]] <- as.numeric(levels(dat[[i]])[dat[[i]]]) } dat <- dat[2:nrow(dat) , ] replica <- ReplicaX(dat, vartype, k = k, osigma = osigma) objects.put(replica) cat("The replica table is saved. Save the key of the model run to retrieve your data.\n") if(verbose) { cat("Copy the replica data from here.\n") oprint(replica) } else { cat("Your replica data looks like this:\n") oprint(head(replica)) } |
Show example data and analysis (Making a new run takes ca. 4 minutes)
#Fil: http://www.tilastotiede.fi/ReplicaX/example_MONICA.r #Author: Juha Karvanen (juha.karvanen at iki.fi) # #Summary: This code demonstrates the creation of data replicas with ReplicaX R functions. #The R functions needed are available as http://www.tilastotiede.fi/ReplicaX/ReplicaX_functions.r #For information see http://www.tilastotiede.fi/ReplicaX/ReplicaX_esitys.pdf (in Finnish) # #The data used is a published sample of WHO MONICA data. #Before running the code, you should download the data from #http://www.thl.fi/publications/monica/monograph_cd/data/form04_3.zip # #The data documentation can be found at #http://www.thl.fi/publications/monica/monograph_cd/formats/form048.htm #http://www.thl.fi/publications/monica/monograph_cd/formats/survey.htm # #For more information on the MONICA project see http://www.thl.fi/monica/ library(OpasnetUtils) objects.latest("Op_en6248", code_name = "initiate") # Fetch the ReplicaX functions # Fetch the Monica data monica <- opasnet.csv(filename = "f/f7/Monica.zip", wiki = "opasnet_en", unzip = "Monica.csv", sep = ",", header = TRUE) library(Lmoments) library(bayesm) library(MASS) library(plyr) library(compiler) enableJIT(3) print("ReplicaX demonstration with the MONICA data. Running the code takes a couple minutes.") set.seed(20131103) #to reproduce the results selectedcolnames <- c("marit","edlevel","school","height","weight","waist","hip") monica2 <- monica[,selectedcolnames] vartype <- c("discrete","discrete","discrete","continuous","continuous","continuous","continuous") #Generating replica data monica2_r <- ReplicaX(monica2,vartype,k=5,osigma=0.5) #Rounding values to the original precision (Depending on the use case, this step can be skipped.) monica2_r$height <- round(monica2_r$height) monica2_r$weight <- round(monica2_r$weight,digits=1) monica2_r$waist <- round(monica2_r$waist,digits=1) monica2_r$hip <- round(monica2_r$hip,digits=1) oprint(head(monica2)) print("Summary statistics:") print("Original data:") oprint(summary(monica2)) print("Replica data:") oprint(summary(monica2_r)) print("The original dataset has 8 individuals with marital status '5, (other)'. These individuals cannot be identified from the replica dataset.") marit5ind <- (monica2$marit==5) print("Original data:") oprint(monica2[marit5ind,]) print("Corresponding rows in the replica data:") oprint(monica2_r[marit5ind,]) print("Cross-tabulation") print("Original data:") oprint(table(monica2$marit,monica2$edlevel)) print("Replica data:") oprint(table(monica2_r$marit,monica2_r$edlevel)) print("Means and standard deviations by the marital status:") print("Original data:") oprint(ddply(monica2,.(marit),summarize,meanweight=mean(weight,na.rm=T),sdweight=sd(weight,na.rm=T), n=sum(is.finite(weight)),se=sdweight/sqrt(n))) print("Replica data:") oprint(ddply(monica2_r,.(marit),summarize,meanweight=mean(weight,na.rm=T),sdweight=sd(weight,na.rm=T), n=sum(is.finite(weight)),se=sdweight/sqrt(n))) #Scatter plots with the original and the replica data plot(monica2$waist,monica2$weight,xlab="Waist circumference (cm)",ylab="Weight (kg)") points(monica2_r$waist,monica2_r$weight,col="red") legend("topleft",c("original data","replica data"),pch=c(1,1),col=c("black","red")) plot(monica2$height,monica2$weight,xlab="Height (cm)",ylab="Weight (kg)") points(monica2_r$height,monica2_r$weight,col="red") legend("topleft",c("original data","replica data"),pch=c(1,1),col=c("black","red")) plot(monica2$weight/(monica2$height/100)^2,monica2$waist/monica2$hip, xlab="Body mass index",ylab="Waist/hip ratio") points(monica2_r$weight/(monica2_r$height/100)^2,monica2_r$waist/monica2_r$hip,col="red") legend("topright",c("original data","replica data"),pch=c(1,1),col=c("black","red")) print("Correlation coefficients:") print("Original data:") oprint(cor(monica2[,4:7],use="complete.obs")) print("Replica data:") oprint(cor(monica2_r[,4:7],use="complete.obs")) print("End of demonstration.") |
Rationale
Calculations
Replica R function by Juha Karvanen does exactly that.
library(OpasnetUtils) ReplicaX <- function(x,vartype,k=5,osigma=0.5) { y <- replica_discrete(x,vartype,k=5) y_continuous <- replica_continuous(x[,vartype=="continuous"],osigma=osigma) y[,vartype=="continuous"] <- y_continuous return(y) } replica_discrete <- function(x,vartype,k=5) { y <- x n <- nrow(x) distmat <- rowwise_similarity(x,x,vartype=vartype) neighbors <- t(apply(distmat,2,FUN=which.maxk,k=k+1)[-1,]) for(i in 1:n) { y[i,] <- new_obs2(x[neighbors[i,],],vartype=vartype) } return(y) } new_obs2 <- function(xx,vartype) { xnew <- xx[1,] k <- nrow(xx) for(j in 1:ncol(xnew)) { if(vartype[j]=="continuous") { finiteind <- is.finite(xx[,j]) kfinite <- sum(finiteind) if(kfinite>1) { xnew[j] <- sum(rdirichlet(rep(1,kfinite))*xx[finiteind,j]) } else { xnew[j] <- NA } } else { ind <- as.logical(rmultinom(1,1,prob=rep(1/k,k))) xnew[j] <- xx[ind,j] } } return(xnew) } which.maxk <- function(x,k) { return(order(x,decreasing=T)[1:k]) } observation_similarity <- function(v1,d2,vartype,variance) { #v1 one row dataframe, d2 dataframe p <- ncol(v1) n <- nrow(d2) score <- rep(0,n) for(j in 1:p) { if(vartype[j]=="continuous") { score <- score + pmax(0,1-(v1[,j]-d2[,j])^2/variance[j],na.rm=T) } else { score <- score + as.numeric(as.character(d2[,j])==as.character(v1[,j])) } } return(score) } navar <- function(x) { p <- ncol(x) v <- rep(NA,p) for(j in 1:p) { if(is.numeric(x[,j])) v[j] <- var(x[,j]) } return(v) } rowwise_similarity <- function(x,y,vartype) { n <- nrow(x) similmat <- matrix(NA,nrow=n,ncol=nrow(y)) variance <- navar(y) for(j in 1:n) { similmat[j,] <- observation_similarity(x[j,],y,vartype=vartype,variance=variance) } return(similmat) } kernelCDF <- function(x,n=512,cut=3) { bw <- density(x,give.Rkern=T,bw="SJ") xp <- seq(min(x)-cut*bw,max(x)+cut*bw,length.out=n) A <- outer(xp,x,pnorm,sd=bw)/length(x) cdf <- rowSums(A) return(data.frame(x=xp,cdf=cdf)) } replica_continuous <- function(x,osigma,umin=0.00000001,umax=0.99999999,kernelweight=0.9) { n <- dim(x)[1] p <- dim(x)[2] if(length(osigma)==1) osigma<-rep(osigma,p) u <- matrix(NA,nrow=n,ncol=p) uparam <- matrix(NA,nrow=n,ncol=p) ukernel <- matrix(NA,nrow=n,ncol=p) y <- x yparam <- x ykernel <- x param <- list() bestpdf <- list() kcdf <- list() ind <- 1:n #Modeling of marginal distributions for(j in 1:p) { xf <- na.omit(x[,j]) ind_finite <- is.finite(x[,j]) if(kernelweight>0 & kernelweight<1) { param[[j]] <- data2normpoly4(xf) uparam[ind_finite,j] <- pnormpoly(x[ind_finite,j],param[[j]]) kcdf[[j]] <- kernelCDF(xf) pspl <- splinefun(kcdf[[j]]$x,kcdf[[j]]$cdf,method="monoH.FC") ukernel[ind_finite,j] <- pspl(x[ind_finite,j]) u[ind_finite,j] <- (1-kernelweight)*uparam[ind_finite,j] + kernelweight*ukernel[ind_finite,j] u[!ind_finite,j] <- NA } if(kernelweight==1) { kcdf[[j]] <- kernelCDF(xf) ukernel[ind_finite,j] <- kcdf[[j]]$cdf[ind_finite] u[ind_finite,j] <- ukernel[ind_finite,j] u[!ind_finite,j] <- NA } if(kernelweight==0) { param[[j]] <- data2normpoly4(xf) uparam[ind_finite,j] <- pnormpoly(x[ind_finite,j],param[[j]])[ind] u[ind_finite,j] <- uparam[ind_finite,j] u[!ind_finite,j] <- NA } u[ind_finite,j] <- pmin(umax,pmax(u[ind_finite,j],umin)) } #Generation of correlated noise in Gaussian space unorm <- qnorm(u) if(is.null(osigma)) { dr <- dist(unorm[sample(1:n,1000),]) osigma <- quantile(as.vector(dr),0.001) } normcor <- cor(unorm,use="complete.obs") epsilon <- mvrnorm(n,mu=rep(0,p),Sigma=diag(osigma)%*%normcor%*%diag(osigma)) #Generation of new observations for(j in 1:p) { noisyu <- unorm[,j]+epsilon[,j] ind_finite <- is.finite(x[,j]) noisyu[ind_finite] <- noisyu[ind_finite]/sd(noisyu[ind_finite]) if(kernelweight>0 & kernelweight<1) { yparam[ind_finite,j] <- qnormpoly(pmin(umax,pmax(umin,pnorm(noisyu[ind_finite]))), param=param[[j]]) qspl <- splinefun(kcdf[[j]]$cdf,kcdf[[j]]$x,method="monoH.FC") ykernel[ind_finite,j] <- qspl(pnorm(noisyu[ind_finite])) y[ind_finite,j] <- (1-kernelweight)*yparam[ind_finite,j] + kernelweight*ykernel[ind_finite,j] } if(kernelweight==0) { yparam[ind_finite,j] <- qnormpoly(pmin(umax,pmax(umin,pnorm(noisyu[ind_finite]))), param=param[[j]]) y[ind_finite,j] <- yparam[ind_finite,j] } if(kernelweight==1) { qspl <- splinefun(kcdf[[j]]$cdf,kcdf[[j]]$x,method="monoH.FC") ykernel[ind_finite,j] <- qspl(pnorm(noisyu[ind_finite])) y[ind_finite,j] <- ykernel[ind_finite,j] } } names(y) <- names(x) return(y) } objects.store( ReplicaX, replica_discrete, new_obs2, which.maxk, observation_similarity, navar, rowwise_similarity, kernelCDF, replica_continuous ) cat("Saved objects ReplicaX, replica_discrete, new_obs2, which.maxk, observation_similarity, navar, rowwise_similarity, kernelCDF, replica_continuous. Remember to load packages Lmoments, bayesm, MASS, plyr, and compiler before using ReplicaX.\n") |
- Description of ReplicaX in the Apps4Finland contest (in Finnish).
See also
- Monica project: data File:Monica.zip
- Portal:Data
- op_fi:Apps4Finland
Keywords
References
Retrieved from "https://en.opasnet.org/index.php?title=ReplicaX&oldid=42682"