Difference between revisions of "ReplicaX"

From Opasnet
Jump to: navigation, search
(code improved, but packages are missing)
(Answer: rcode updated)
Line 12: Line 12:
 
==Answer==
 
==Answer==
  
Replica R function by Juha Karvanen does exactly that.
+
Rcode ReplicaX by Juha Karvanen does exactly that.
<rcode>
+
 
 +
<rcode label="Run an example">
 +
 
 +
#Fil: http://www.tilastotiede.fi/ReplicaX/example_MONICA.r
 +
#Author: Juha Karvanen (juha.karvanen at iki.fi)
 +
#
 +
#Summary: This code demonstrates the creation of data replicas with ReplicaX R functions.
 +
#The R functions needed are available as http://www.tilastotiede.fi/ReplicaX/ReplicaX_functions.r
 +
#For information see http://www.tilastotiede.fi/ReplicaX/ReplicaX_esitys.pdf (in Finnish)
 +
#
 +
#The data used is a published sample of WHO MONICA data.
 +
#Before running the code, you should download the data from
 +
#http://www.thl.fi/publications/monica/monograph_cd/data/form04_3.zip
 +
#
 +
#The data documentation can be found at
 +
#http://www.thl.fi/publications/monica/monograph_cd/formats/form048.htm
 +
#http://www.thl.fi/publications/monica/monograph_cd/formats/survey.htm
 +
#
 +
#For more information on the MONICA project see http://www.thl.fi/monica/
  
 
library(OpasnetUtils)
 
library(OpasnetUtils)
  
 +
objects.latest("Op_en6248", code_name = "initiate") # Fetch the ReplicaX functions
 +
 +
# Fetch the Monica data
 
monica <- opasnet.csv(filename = "f/f7/Monica.zip", wiki = "opasnet_en", unzip = "Monica.csv", sep = ",", header = TRUE)
 
monica <- opasnet.csv(filename = "f/f7/Monica.zip", wiki = "opasnet_en", unzip = "Monica.csv", sep = ",", header = TRUE)
 +
 
oprint(head(monica))
 
oprint(head(monica))
  
Line 27: Line 49:
  
 
enableJIT(3)
 
enableJIT(3)
 +
 +
print("ReplicaX demonstration with the MONICA data. Running the code takes a couple minutes.")
 +
 +
set.seed(20131103) #to reproduce the results
 +
 +
selectedcolnames <- c("marit","edlevel","school","height","weight","waist","hip")
 +
 +
monica2 <- monica[,selectedcolnames]
 +
 +
vartype <- c("discrete","discrete","discrete","continuous","continuous","continuous","continuous")
 +
 +
#Generating replica data
 +
monica2_r <- ReplicaX(monica2,vartype,k=5,osigma=0.5)
 +
 +
#Rounding values to the original precision (Depending on the use case, this step can be skipped.)
 +
monica2_r$height <- round(monica2_r$height)
 +
monica2_r$weight <- round(monica2_r$weight,digits=1)
 +
monica2_r$waist <- round(monica2_r$waist,digits=1)
 +
monica2_r$hip <- round(monica2_r$hip,digits=1)
 +
 +
print("Summary statistics:")
 +
print("Original data:")
 +
print(summary(monica2))
 +
print("Replica data:")
 +
print(summary(monica2_r))
 +
 +
print("The original dataset has 8 individuals with marital status '5, (other)'.
 +
      These individuals cannot be identified from the replica dataset.")
 +
marit5ind <- (monica2$marit==5)
 +
print("Original data:")
 +
print(monica2[marit5ind,])
 +
print("Corresponding rows in the replica data:")
 +
print(monica2_r[marit5ind,])
 +
 +
print("Cross-tabulation")
 +
print("Original data:")
 +
print(table(monica2$marit,monica2$edlevel))
 +
print("Replica data:")
 +
print(table(monica2_r$marit,monica2_r$edlevel))
 +
 +
print("Means and standard deviations by the marital status:")
 +
print("Original data:")
 +
print(ddply(monica2,.(marit),summarize,meanweight=mean(weight,na.rm=T),sdweight=sd(weight,na.rm=T),
 +
            n=sum(is.finite(weight)),se=sdweight/sqrt(n)))
 +
print("Replica data:")
 +
print(ddply(monica2_r,.(marit),summarize,meanweight=mean(weight,na.rm=T),sdweight=sd(weight,na.rm=T),
 +
            n=sum(is.finite(weight)),se=sdweight/sqrt(n)))
 +
 +
 +
#Scatter plots with the original and the replica data
 +
plot(monica2$waist,monica2$weight,xlab="Waist circumference (cm)",ylab="Weight (kg)")
 +
points(monica2_r$waist,monica2_r$weight,col="red")
 +
legend("topleft",c("original data","replica data"),pch=c(1,1),col=c("black","red"))
 +
 +
plot(monica2$height,monica2$weight,xlab="Height (cm)",ylab="Weight (kg)")
 +
points(monica2_r$height,monica2_r$weight,col="red")
 +
legend("topleft",c("original data","replica data"),pch=c(1,1),col=c("black","red"))
 +
 +
plot(monica2$weight/(monica2$height/100)^2,monica2$waist/monica2$hip,
 +
    xlab="Body mass index",ylab="Waist/hip ratio")
 +
points(monica2_r$weight/(monica2_r$height/100)^2,monica2_r$waist/monica2_r$hip,col="red")
 +
legend("topright",c("original data","replica data"),pch=c(1,1),col=c("black","red"))
 +
 +
 +
print("Correlation coefficients:")
 +
print("Original data:")
 +
print(cor(monica2[,4:7],use="complete.obs"))
 +
print("Replica data:")
 +
print(cor(monica2_r[,4:7],use="complete.obs"))
 +
 +
print("End of demonstration.")
 +
 +
</rcode>
 +
 +
==Rationale==
 +
 +
===Functions===
 +
 +
Replica R function by Juha Karvanen does exactly that.
 +
<rcode embed=1 name="initiate" label="Initiate functions">
 +
 +
library(OpasnetUtils)
  
 
ReplicaX <- function(x,vartype,k=5,osigma=0.5)
 
ReplicaX <- function(x,vartype,k=5,osigma=0.5)
Line 118: Line 222:
 
   return(similmat)
 
   return(similmat)
 
}   
 
}   
 
  
 
kernelCDF <- function(x,n=512,cut=3)
 
kernelCDF <- function(x,n=512,cut=3)
Line 220: Line 323:
 
}
 
}
  
 
+
objects.store(
#Fil: http://www.tilastotiede.fi/ReplicaX/example_MONICA.r
+
ReplicaX,  
#Author: Juha Karvanen (juha.karvanen at iki.fi)
+
replica_discrete,  
#
+
new_obs2,  
#Summary: This code demonstrates the creation of data replicas with ReplicaX R functions.
+
which.maxk,  
#The R functions needed are available as http://www.tilastotiede.fi/ReplicaX/ReplicaX_functions.r
+
observation_similarity,  
#For information see http://www.tilastotiede.fi/ReplicaX/ReplicaX_esitys.pdf (in Finnish)
+
navar,  
#
+
rowwise_similarity,  
#The data used is a published sample of WHO MONICA data.
+
kernelCDF,  
#Before running the code, you should download the data from
+
replica_continuous
#http://www.thl.fi/publications/monica/monograph_cd/data/form04_3.zip
 
#
 
#The data documentation can be found at
 
#http://www.thl.fi/publications/monica/monograph_cd/formats/form048.htm
 
#http://www.thl.fi/publications/monica/monograph_cd/formats/survey.htm
 
#
 
#For more information on the MONICA project see http://www.thl.fi/monica/
 
 
 
 
 
print("ReplicaX demonstration with the MONICA data. Running the code takes a couple minutes.")
 
 
 
#rm(list=ls())
 
set.seed(20131103) #to reproduce the results
 
#path <- "/home/jutakarv/Documents/replica_gen/"
 
#path <- "/home/q/tiede/replica_gen/"
 
#path <- paste(getwd(),"/",sep="")
 
#so#urce(paste(path,"ReplicaX_functions.r",sep=""))
 
 
 
#path <- "//cesium/jtue$/_Downloads/form04_3/"
 
#if(!fi#le.exists(paste(path,"form04_3.txt",sep="")))
 
#{ 
 
#  stop("Data fi#le form04_3.txt does not exist in the current path.
 
#  Please download it from http://www.thl.fi/publications/monica/monograph_cd/data/form04_3.zip
 
#  or change the path definition")
 
#}
 
widths <- c(
 
2,1,2,2,6,1,3,8,8,1,1,1,1,2,1,3,1,1,4,1,3,2,1,3,1,3,2,1,1,1,1,1,1,1,1,1,2,1,1,3,3,2,3,3,2,1,2,2,4,2,3,3,8,3,3,8,3,4,2,3,4,
 
4,4,2,1,2,1,1,1,1,1,4,4,4,4
 
)
 
 
 
colnames <- c(
 
"form","versn","centre","runit","serial","numsur","samunit","dexam","mbirth","agegr","sex","marit","edlevel",
 
"school","cigs","numcigs","daycigs","evercig","stop","iflyear","maxcigs","cigage","cigarsm","cigar","pipesm","pipe",
 
"othersm","hibp","drugs","bprecd","hich","chdt","chrx","chrecd","asp","menop","agem","horm","pill","syst1",
 
"diast1","rz1","syst2","diast2","rz2","cuff","arm","bpcoder","timebp","rtemp","chol","choldl","dchol","hdl",
 
"hdldl","dhdl","scn","cotin","carbmon","height","weight","waist","hip","whcoder","oversion","eage","eageg",
 
"cohort1","cohort2","edtert1","edtert2","systm","diastm","chola","hdla"
 
 
)
 
)
  
#monica <- re#ad.fwf(fi#le=paste(path,"form04_3.txt",sep=""),widths=widths,col.names=colnames)
+
cat("Saved objects ReplicaX,  
 
+
replica_discrete,  
selectedcolnames <- c("marit","edlevel","school","height","weight","waist","hip")
+
new_obs2,  
monica2 <- monica[,selectedcolnames]
+
which.maxk,  
vartype <- c("discrete","discrete","discrete","continuous","continuous","continuous","continuous")
+
observation_similarity,  
 
+
navar,  
#decoding  missing values
+
rowwise_similarity,  
monica2$height[monica2$height==999] <- NA
+
kernelCDF,  
monica2$weight[monica2$weight==9999] <- NA
+
replica_continuous. Remember to load packages Lmoments, bayesm, MASS, plyr, and compiler before using ReplicaX.\n")
monica2$waist[monica2$waist==9999] <- NA
 
monica2$hip[monica2$hip==9999] <- NA
 
monica2$school[monica2$school == 99] <- NA
 
 
 
#unit transformations
 
monica2$weight <- monica2$weight/10 #transform to kg
 
monica2$waist <- monica2$waist/10 #transform to cm
 
monica2$hip <- monica2$hip/10 #transform to cm
 
 
 
#Generatin replica data
 
monica2_r <- ReplicaX(monica2,vartype,k=5,osigma=0.5)
 
 
 
#Rounding values to the original precision (Depending on the use case, this step can be skipped.)
 
monica2_r$height <- round(monica2_r$height)
 
monica2_r$weight <- round(monica2_r$weight,digits=1)
 
monica2_r$waist <- round(monica2_r$waist,digits=1)
 
monica2_r$hip <- round(monica2_r$hip,digits=1)
 
 
 
print("Summary statistics:")
 
print("Original data:")
 
print(summary(monica2))
 
print("Replica data:")
 
print(summary(monica2_r))
 
 
 
print("The original dataset has 8 individuals with marital status '5, (other)'.
 
      These individuals cannot be identified from the replica dataset.")
 
marit5ind <- (monica2$marit==5)
 
print("Original data:")
 
print(monica2[marit5ind,])
 
print("Corresponding rows in the replica data:")
 
print(monica2_r[marit5ind,])
 
 
 
print("Cross-tabulation")
 
print("Original data:")
 
print(table(monica2$marit,monica2$edlevel))
 
print("Replica data:")
 
print(table(monica2_r$marit,monica2_r$edlevel))
 
 
 
print("Means and standard deviations by the marital status:")
 
print("Original data:")
 
print(ddply(monica2,.(marit),summarize,meanweight=mean(weight,na.rm=T),sdweight=sd(weight,na.rm=T),
 
            n=sum(is.finite(weight)),se=sdweight/sqrt(n)))
 
print("Replica data:")
 
print(ddply(monica2_r,.(marit),summarize,meanweight=mean(weight,na.rm=T),sdweight=sd(weight,na.rm=T),
 
            n=sum(is.finite(weight)),se=sdweight/sqrt(n)))
 
 
 
 
 
#Scatter plots with the original and the replica data
 
plot(monica2$waist,monica2$weight,xlab="Waist circumference (cm)",ylab="Weight (kg)")
 
points(monica2_r$waist,monica2_r$weight,col="red")
 
legend("topleft",c("original data","replica data"),pch=c(1,1),col=c("black","red"))
 
 
 
plot(monica2$height,monica2$weight,xlab="Height (cm)",ylab="Weight (kg)")
 
points(monica2_r$height,monica2_r$weight,col="red")
 
legend("topleft",c("original data","replica data"),pch=c(1,1),col=c("black","red"))
 
 
 
plot(monica2$weight/(monica2$height/100)^2,monica2$waist/monica2$hip,
 
    xlab="Body mass index",ylab="Waist/hip ratio")
 
points(monica2_r$weight/(monica2_r$height/100)^2,monica2_r$waist/monica2_r$hip,col="red")
 
legend("topright",c("original data","replica data"),pch=c(1,1),col=c("black","red"))
 
 
 
 
 
print("Correlation coefficients:")
 
print("Original data:")
 
print(cor(monica2[,4:7],use="complete.obs"))
 
print("Replica data:")
 
print(cor(monica2_r[,4:7],use="complete.obs"))
 
 
 
print("End of demonstration.")
 
  
 
</rcode>
 
</rcode>
 
  
 
== Rationale ==
 
== Rationale ==

Revision as of 18:01, 8 December 2013



Question

How to generate an artificial data from a real data in such a way that

  • individuals are no longer identifiable from the data,
  • the marginal distributions are close to those of the original data,
  • the correlation structure is close to that of the original data?

Answer

Rcode ReplicaX by Juha Karvanen does exactly that.

+ Show code

Rationale

Functions

Replica R function by Juha Karvanen does exactly that.

+ Show code

Rationale

See also

Keywords

References