|
|
(3 intermediate revisions by one other user not shown) |
Line 1: |
Line 1: |
| [[Category:R tool]]
| | #REDIRECT [[OpasnetUtils/Interpret]] |
| {{method|moderator=Jouni|stub=Yes}}
| |
| '''input.interp''' is an R function that interprets model inputs from a user-friendly format into explicit and exact mathematical format. The purpose is to make it easy for a user to give input without a need to worry about technical modelling details.
| |
| | |
| ==Question==
| |
| | |
| What should be a list of important user input formats, and how should they be interpreted?
| |
| | |
| ==Answer==
| |
| | |
| The basic feature is that if a text string can be converted to a meaningful numeric object, it will be. This function can be used when data is downloaded from [[Opasnet Base]]: if Result.Text contains this kind of numeric information, it is converted to numbers and fused with Result.
| |
| | |
| n is the number of iterations in the model. # is any numeric character in the text string.
| |
| | |
| {| {{prettytable}}
| |
| !Example!!Regular expression!!Interpretation !! Output in R
| |
| |----
| |
| | 12 000 ||# # || 12000. Text is interpreted as number if space removal makes it a number. || as.numeric(gsub(" ", "", Result.text))
| |
| |----
| |
| | -14,23 || -# || -14.23. Minus in the beginning of entry is interpreted as minus, not a sign for a range. ||{{attack|# |Not needed. See above.|--[[User:Jouni|Jouni]] 15:30, 16 April 2012 (EEST)}}
| |
| |----
| |
| | 50 - 125 ||# - # ||Uniform distribution between 50 and 125 || data.frame(iter=1:n, result=runif(n,50,125))
| |
| |----
| |
| | -12 345 - -23.56 || -# - -#|| Uniform distribution between -12345 and -23.56. ||
| |
| |----
| |
| | 1 - 50 ||# - # || Loguniform distribution between 1 and 50 (Lognormality is assumed if the ratio of upper to lower is > 100) ||
| |
| |----
| |
| | 3.1 ± 1.2 or 3.1 +- 1.2||# ± # or # +- # ||Normal distribution with mean 3.1 and SD 1.2 || data.frame(iter=1:n, result=rnorm(n,3.1,1.2))
| |
| |----
| |
| | 2.4 (1.8 - 3.0) || # (# - #) ||Normal distribution with mean 2.4 and 95 % confidence interval from 1.8 to 3.0 || data.frame(iter=1:n, result=rnorm(n,2.4,(3.0-1.8)/2/1.96))
| |
| |----
| |
| | 2.4 (2.0 - 3.2) || # (# - #) ||Lognormal distribution with mean 2.4 and 95 % confidence interval from 2.0 to 3.0. Lognormality is assumed if the difference from mean to upper limit is => 50 % greater than from mean to lower limit.||
| |
| |----
| |
| | 2;4;7 || || Each entry (2, 4, and 7 in this case) are equally likely to occur. Entries can also be text.||
| |
| |----
| |
| | (0,0.5,1) || (#,#,#) ||Triangular distribution||
| |
| |----
| |
| | * (in index, or explanatory, columns) || || The result applies to all locations of this index.|| With merge() function, this column is not used as a criterion when these rows are merged.
| |
| |}
| |
| | |
| How to actually make this happen in R?
| |
| # Make a temporary result ''temp'' by removing all spaces from Result.Text. Columns: ''Indices,Result.Result.Text,temp'' (Indices contains all explanatory columns.)
| |
| # Replace all "," with "."
| |
| # Check if there are parentheses "()". If yes, assume that they contain 95 % CI.
| |
| # Check if there are ranges "#-#".
| |
| # Divide the rows of the data.frame into two new data.frames with the same list of columns (''Indices,Result'').
| |
| #* If temp is a syntactically correct distribution, take the row to data.frame A and replace ''Result'' with ''temp''.
| |
| #* Otherwise, take the row to data.frame B and replace ''Result'' with ''Result.Text'' if that is not NA.
| |
| # Create a new data.frame with index Iter = 1:n.
| |
| # Make a random sample from each probability distribution in data.frame A using Iter.
| |
| # Merge the data.frame B with Iter.
| |
| # Join data.frames A and B with rbind(). Columns: ''Iter,Index,Result''.
| |
| | |
| ===Rcode===
| |
| | |
| <rcode name="answer">
| |
| library(triangle)
| |
| | |
| # Lognormal distribution parametrization functions
| |
| lmean <- function(parmean, parsd) {return(log(parmean)-log(1+(parsd^2)/(parmean^2))/2)}
| |
| lsd <- function(parmean, parsd) {return(log(1+(parsd^2)/(parmean^2)))}
| |
| | |
| # Actual interpretation function. Takes already pre-processed information and returns a distribution.
| |
| interpf <- function(
| |
| n,
| |
| res.char,
| |
| brackets.pos,
| |
| brackets.length,
| |
| minus,
| |
| minus.length,
| |
| minus.exists,
| |
| plusminus,
| |
| plusminus.length,
| |
| plusminus.exists,
| |
| doublePoint
| |
| ) {
| |
| | |
| if(doublePoint[1] > 0) {
| |
| tempArgs <- sort(as.numeric(unlist(strsplit(res.char, "\\:"))))
| |
| return(rtriangle(n,tempArgs[1],tempArgs[3],tempArgs[2]))
| |
| }
| |
| if(brackets.pos >= 0) {
| |
| minus.relevant <- unlist(minus)[(cumsum(c(0, minus.length)) + 1):cumsum(minus.length)]
| |
| n.minus.inside.brackets <- sum(minus.relevant > brackets.pos & minus.relevant < brackets.pos + brackets.length)
| |
| imean <- as.numeric(substr(res.char, 1, brackets.pos - 1))
| |
| if(n.minus.inside.brackets == 1) {
| |
| ici <- c(as.numeric(substr(res.char, brackets.pos + 1, minus.relevant[minus.relevant > brackets.pos] - 1)), as.numeric(substr(res.char,
| |
| minus.relevant[minus.relevant > brackets.pos] + 1, brackets.pos + brackets.length - 2)))
| |
| isd <- sum(abs(ici - imean) / 2) / qnorm(0.975)
| |
| if((ici[2] - imean) / (ici[1] - imean) < 1.5) {
| |
| return(rnorm(n, imean, isd))
| |
| } else {
| |
| return(out[[i]] <- rlnorm(n, lmean(imean, isd), lsd(imean, isd))) # menee vaarin koska isd on laskettu normaalijakaumalle
| |
| }
| |
| } else
| |
| if(n.minus.inside.brackets %in% c(2,3)) {
| |
| ici <- c(as.numeric(substr(res.char, brackets.pos + 1, minus.relevant[minus.relevant > brackets.pos][2] - 1)), as.numeric(substr(res.char,
| |
| minus.relevant[minus.relevant > brackets.pos][2] + 1, brackets.pos + brackets.length - 2)))
| |
| isd <- sum(abs(ici - imean) / 2) / qnorm(0.975)
| |
| return(rnorm(n, imean, isd))
| |
| }
| |
| warning(paste("Unable to interpret \"", res.char, "\"", sep = ""))
| |
| return(NA)
| |
| }
| |
| if(minus.exists) {
| |
| minus.relevant <- unlist(minus)[(cumsum(c(0, minus.length)) + 1):cumsum(minus.length)]
| |
| if(length(minus.relevant) == 1) {
| |
| if(as.numeric(substr(res.char, 1, minus.relevant - 1)) / as.numeric(substr(res.char, minus.relevant + 1, nchar(res.char))) >= 1/100) {
| |
| return(runif(n, as.numeric(substr(res.char, 1, minus.relevant - 1)), as.numeric(substr(res.char, minus.relevant + 1, nchar(res.char[i])))))
| |
| } else {
| |
| return(exp(runif(n, log(as.numeric(substr(res.char, 1, minus.relevant - 1))), log(as.numeric(substr(res.char, minus.relevant + 1, nchar(res.char)))))))
| |
| }
| |
| }
| |
| if(length(minus.relevant) %in% c(2,3)) {
| |
| return(runif(n, as.numeric(substr(res.char, 1, minus.relevant[2] - 1)), as.numeric(substr(res.char, minus.relevant[2] + 1, nchar(res.char)))))
| |
| }
| |
| }
| |
| if(plusminus.exists) {
| |
| return(rnorm(n, as.numeric(substr(res.char, 1, plusminus[1] - 1)), as.numeric(substr(res.char, plusminus[1] + 1, nchar(res.char)))))
| |
| }
| |
| if(sum(unlist(strsplit(res.char, ""))==";") > 0) {
| |
| return(sample(sapply(strsplit(res.char, ";"), as.numeric), N, replace = TRUE))
| |
| }
| |
| warning(paste("Unable to interpret \"", res.char, "\"", sep = ""))
| |
| return(NA)
| |
| }
| |
| | |
| # The next function processes character strings and loops the interpretation function.
| |
| input.interp <- function(res.char, n = 1000) {
| |
| res.char <- gsub(" ", "", res.char)
| |
| res.char <- gsub(",", ".", res.char)
| |
| plusminus <- gregexpr(paste("\\+-|", rawToChar(as.raw(177)), sep = ""), res.char) # saattaa osoittautua ongelmaksi enkoodauksen vuoksi
| |
| plusminus.length <- sapply(plusminus, length)
| |
| plusminus.exists <- unlist(plusminus)[cumsum(c(0, plusminus.length[-length(plusminus.length)])) + 1] > 0
| |
| minus <- gregexpr("-", res.char)
| |
| minus.length <- sapply(minus, length)
| |
| minus.exists <- unlist(minus)[cumsum(c(0, minus.length[-length(minus.length)])) + 1] > 0
| |
| brackets <- gregexpr("\\(.*\\)", res.char) # matches for brackets "(...)"
| |
| brackets.length <- as.numeric(unlist(sapply(brackets, attributes)[1,]))
| |
| brackets.pos <- unlist(brackets)
| |
| doublePoint <- gregexpr(":", res.char)
| |
| out <- list()
| |
| for(i in 1:length(res.char)) {
| |
| out[[i]] <- interpf(n, res.char[i], brackets.pos[i], brackets.length[i], minus[i], minus.length[i], minus.exists[i], plusminus[[i]],
| |
| plusminus.length[i], plusminus.exists[i],doublePoint[[i]])
| |
| }
| |
| out
| |
| }
| |
| | |
| # Assisting function for data.frame wrapper.
| |
| iter.f <- function(x) {
| |
| 1:x
| |
| }
| |
| | |
| # Data.frame wrapper for the functions.
| |
| interpret <- function(idata, rescol = "Result", N = 1000) {
| |
| | |
| temp <- input.interp(idata[, rescol], N)
| |
| temp.lengths <- sapply(temp, length)
| |
| out <- idata[rep(1:nrow(idata), times = temp.lengths),]
| |
| out$Interp.Result <- unlist(temp)
| |
| dim(temp.lengths) <- length(temp.lengths)
| |
| out$Iter<- c(apply(temp.lengths, 1, iter.f))
| |
| out
| |
| }
| |
| | |
| setGeneric("interpret")
| |
| | |
| setMethod(
| |
| f = "interpret",
| |
| signature = signature(idata = "character"),
| |
| definition = function(idata) {
| |
| if(!is.data.frame){
| |
| callGeneric(data.frame(Result = idata))
| |
| }
| |
| callGeneric(idata)
| |
| }
| |
| )
| |
| | |
| ===Independent or not?===
| |
| | |
| Specific character string can be converted to distributions. There are two ways to do this, with parameter independent = TRUE in the first case, and FALSE in the second one:
| |
| # The character strings are interpreted one row at a time, and each row is made an independent distribution.
| |
| # The character strings are treated as a factor. The levels of the factor are converted to distributions. Therefore, all rows that have the same character string will have the an identical (not independent) distribution.
| |