Opasnet Base Connection for R: Difference between revisions
Jump to navigation
Jump to search
m (→Functions) |
No edit summary |
||
Line 112: | Line 112: | ||
**Varying numbers of iterations between cells not supported. | **Varying numbers of iterations between cells not supported. | ||
**Probabilistic data must also follow a certain sorting order (indices first, obs last). | **Probabilistic data must also follow a certain sorting order (indices first, obs last). | ||
*Uploading indices and locations relies on updating a dummy column when a duplicate is encountered. | *Uploading indices and locations relies on updating a dummy column when a duplicate is encountered. | ||
**Those dummy columns are "newest" and "roww" for indices and locations respectively, which are obsolete anyway. | **Those dummy columns are "newest" and "roww" for indices and locations respectively, which are obsolete anyway. | ||
Line 119: | Line 118: | ||
===Functions=== | ===Functions=== | ||
op_baseWrite <- function(db, input, ident = NULL, name = NULL, unit = NULL, objtype_id = NULL, who = NULL) { | op_baseWrite <- function(db, input, ident = NULL, name = NULL, unit = NULL, objtype_id = NULL, who = NULL, acttype = NULL) { | ||
#Coerce input into a data frame if it isn't one already; get rid of empty cells | #Coerce input into a data frame if it isn't one already; get rid of empty cells | ||
Line 162: | Line 161: | ||
if (is.null(who)==TRUE) if (interactive()) {who <- readline(paste("What is the name of the uploader?", "\n", sep = "")) | if (is.null(who)==TRUE) if (interactive()) {who <- readline(paste("What is the name of the uploader?", "\n", sep = "")) | ||
} else stop("uploader name not given") | } else stop("uploader name not given") | ||
<nowiki>sqlQuery(db, paste('INSERT INTO act (acttype_id, who, comments) VALUES ( | series_id <- sqlQuery(db, paste("SELECT series_id FROM actobj WHERE obj_id = ", obj_id, " ORDER BY series_id DESC LIMIT 1", | ||
sep = ""))[1,1] | |||
if (is.na(series_id)==FALSE) {if (is.null(acttype)==TRUE) if (interactive()) {acttype <- readline(paste("What type of upload", | |||
" is this? 4 - new data to replace any existing, 5 - new data to be appended to existing data (must have the same", | |||
" indices).", "\n", sep = "")) | |||
} else acctype <- 4 | |||
} else acctype <- 4 | |||
if (acttype != 4|5) stop ("proper acttype not given") | |||
<nowiki>sqlQuery(db, paste('INSERT INTO act (acttype_id, who, comments) VALUES (', acttype, ',"', who, '","R upload")', sep = ''))</nowiki> | |||
act_id <- sqlQuery(db, "SELECT id FROM act ORDER BY id DESC LIMIT 1")[1,1] | act_id <- sqlQuery(db, "SELECT id FROM act ORDER BY id DESC LIMIT 1")[1,1] | ||
sqlQuery(db, paste('INSERT INTO actobj (act_id, obj_id, series_id) VALUES (', paste(act_id, obj_id, | if (acttype == 4) series_id <- act_id | ||
sqlQuery(db, paste('INSERT INTO actobj (act_id, obj_id, series_id) VALUES (', paste(act_id, obj_id, series_id, sep = ','), | |||
<nowiki>')', sep = ''))</nowiki> | <nowiki>')', sep = ''))</nowiki> | ||
<nowiki>actobj_id <- sqlQuery(db, paste('SELECT id FROM actobj WHERE act_id = ', act_id, sep = ''))[1,1]</nowiki> | <nowiki>actobj_id <- sqlQuery(db, paste('SELECT id FROM actobj WHERE act_id = ', act_id, sep = ''))[1,1]</nowiki> | ||
Line 319: | Line 327: | ||
====Usage==== | ====Usage==== | ||
op_baseWrite(db, input, ident = NULL, name = NULL, unit = NULL, objtype_id = NULL, who = NULL) | op_baseWrite(db, input, ident = NULL, name = NULL, unit = NULL, objtype_id = NULL, who = NULL, acttype = NULL) | ||
*Db and input must be defined, the rest if not defined is prompted for by the function as needed. | *Db and input must be defined, the rest if not defined is prompted for by the function as needed. |
Revision as of 07:33, 12 October 2010
Moderator:Teemu R (see all) |
|
Upload data
|
Code for R for the purpose of interacting with the Opasnet Base is collected on this page. To use it, copy paste the code to R; this defines the functions, after which they can be called for in R.
Package dependencies
These packages are required for most of the code to work. Load these first (copy-pasteable).
library(utils) library(RODBC)
Setup
Establishes the connection to the database. Copy paste first.
db <- odbcConnect("opasnet_base")
- Assuming "opasnet_base" is a correctly defined DSN.
- Note that uploading requires writing permissions.
Downloading data
Functions
- Now simplified, a lot faster. Copy paste second.
op_baseGetData <- function(db, ident) { obj_id <- sqlQuery(db, paste('SELECT id FROM obj WHERE ident = "', ident, '"', sep=''))[1,1] series_id <- sqlQuery(db, paste('SELECT series_id FROM actobj WHERE obj_id = ', obj_id, ' ORDER BY series_id DESC LIMIT 1'))[1,1] Data <- sqlQuery(db, paste('SELECT cell.id, res.obs, obj.name AS ind, loc.location AS loc, res.result', ' FROM actobj LEFT JOIN cell ON actobj.id = cell.actobj_id LEFT JOIN res ON cell.id =', ' res.cell_id LEFT JOIN loccell ON cell.id = loccell.cell_id LEFT JOIN loc ON loccell.loc_id', ' = loc.id LEFT JOIN obj ON loc.obj_id_i = obj.id WHERE actobj.obj_id = ', obj_id, ' AND actobj.series_id = ', series_id, ' ORDER BY id, obs, ind', sep = '')) nind <- length(levels(Data[,3])) nres <- nrow(Data)/nind dataframe <- Data[1:nres*nind, c(1,2)] for (i in nind:1) { dataframe[,2 + nind - i + 1] <- Data[1:nres*nind - i + 1, 4] colnames(dataframe)[2 + nind - i + 1] <- as.character(Data[nind - i + 1, 3]) } dataframe[,3 + nind] <- Data[1:nres*nind, 5] colnames(dataframe)[3 + nind] <- "Result" rownames(dataframe) <- 1:nres dataframe }
Usage
variable <- op_baseGetData(db, "page identifier")
- Result will be in a table format with columns: id, obs, ind1, ind2 ... indn, Result.
Manipulating data
Functions
DataframeToArray <- function(dataframe) { DimNames <- rep(vector("list", 1), ncol(dataframe) - 2) names(DimNames) <- c(colnames(dataframe)[2:(ncol(dataframe) - 1)]) nind <- length(names(DimNames)) indlengths <- 0 for (i in 1:nind) { DimNames[[i]] <- levels(factor(dataframe[,1 + i])) indlengths[i] <- length(DimNames[[i]]) } array <- rep(NA, prod(indlengths)) dim(array) <- indlengths dimnames(array) <- DimNames array[as.matrix(dataframe[,c(1:nind + 1)])] <- dataframe[,ncol(dataframe)] array }
Usage
variable2 <- DataframeToArray(variable1)
- variable1 must be in similar format as the result when downloading.
- first column (id) is ignored
- variable1 <- function(variable1) is allowed in R
Other useful stuff
as.data.frame(as.table(array)) dataframe[is.na(dataframe[,ncol(dataframe)])==FALSE,]
- First line returns an array from a data frame.
- Second line returns rows without empty values.
dataframe[grep("location", dataframe$index, ignore.case = TRUE),]
- Returns all rows of a data frame where column "index" value is "location".
dataframe[order(dataframe[,"col1"],dataframe[,"col2"], ... ,dataframe[,"coln"]),]
- Returns dataframe ordered by col1, col2, ... , coln.
read.csv("table.csv", sep = ",")
- Returns a data.frame from a .csv file, sep is the separator used in the file.
Uploading data
- Tested and works.
- Very efficient.
- Probabilistic data supported, albeit with following weaknesses:
- Varying numbers of iterations between cells not supported.
- Probabilistic data must also follow a certain sorting order (indices first, obs last).
- Uploading indices and locations relies on updating a dummy column when a duplicate is encountered.
- Those dummy columns are "newest" and "roww" for indices and locations respectively, which are obsolete anyway.
- Using a dummy column is probably faster and simpler than some other workaround.
Functions
op_baseWrite <- function(db, input, ident = NULL, name = NULL, unit = NULL, objtype_id = NULL, who = NULL, acttype = NULL) { #Coerce input into a data frame if it isn't one already; get rid of empty cells if (is.array(input)) dataframe <- as.data.frame(as.table(input)) else dataframe <- input rescol <- colnames(dataframe) == "Freq" if (length(rescol[rescol==TRUE]) >= 1) rescol <- "Freq" else { rescol <- colnames(dataframe) == "Result" if (length(rescol[rescol==TRUE]) >= 1) rescol <- "Result" else { rescol <- colnames(dataframe) == "result" if (length(rescol[rescol==TRUE]) >= 1) rescol <- "result" } } values <- dataframe[,rescol] cond <- (is.na(values) == FALSE) dataframe <- dataframe[cond,] #Add page to database (if it doesn't already exist) if (is.null(ident)==TRUE) if (interactive()) ident <- readline(paste("What is the identifier of this object?", "\n", sep = "")) else stop("indentifier of object no given") wiki_name <- substring(ident, 1, 5) if (wiki_name=="Op_en") {wiki_id <- 1; page <- substring(ident, 6, nchar(ident))} else if (wiki_name=="Op_fi") { wiki_id <- 2; page <- substring(ident, 6, nchar(ident))} else {wiki_id <- 0; page <- 0} obj_id <- sqlQuery(db, paste('SELECT id FROM obj WHERE ident = "', ident, '"', sep = ''))[1,1] if (is.na(obj_id)==TRUE) { if (is.null(name)==TRUE) if (interactive()) name <- readline(paste("What is the name of this object?", "\n", sep = "")) else stop("object name not given") if (is.null(unit)==TRUE) if (interactive()) unit <- readline(paste("What is the unit of this object?", "\n", sep = "")) else stop("unit not given") if (is.null(objtype_id)==TRUE) if (interactive()) objtype_id <- readline(paste("What type of object is", " this (id)?", paste(paste(sqlQuery(db, "SELECT id FROM objtype")[,1], sqlQuery(db, "SELECT objtype", " FROM objtype")[,1], sep = " - "), collapse = ", "), "\n", collapse = " ")) else { stop("object type not given")} sqlQuery(db, paste('INSERT INTO obj (ident, name, unit, objtype_id, page, wiki_id) VALUES ("', paste(ident, name, unit, sep = '","'), '",', paste(objtype_id, page, wiki_id, sep = ','), ')', sep = '')) obj_id <- sqlQuery(db, paste('SELECT id FROM obj WHERE ident = "', ident, '"', sep = ''))[1,1] } #Write act and actobj if (is.null(who)==TRUE) if (interactive()) {who <- readline(paste("What is the name of the uploader?", "\n", sep = "")) } else stop("uploader name not given") series_id <- sqlQuery(db, paste("SELECT series_id FROM actobj WHERE obj_id = ", obj_id, " ORDER BY series_id DESC LIMIT 1", sep = ""))[1,1] if (is.na(series_id)==FALSE) {if (is.null(acttype)==TRUE) if (interactive()) {acttype <- readline(paste("What type of upload", " is this? 4 - new data to replace any existing, 5 - new data to be appended to existing data (must have the same", " indices).", "\n", sep = "")) } else acctype <- 4 } else acctype <- 4 if (acttype != 4|5) stop ("proper acttype not given") sqlQuery(db, paste('INSERT INTO act (acttype_id, who, comments) VALUES (', acttype, ',"', who, '","R upload")', sep = '')) act_id <- sqlQuery(db, "SELECT id FROM act ORDER BY id DESC LIMIT 1")[1,1] if (acttype == 4) series_id <- act_id sqlQuery(db, paste('INSERT INTO actobj (act_id, obj_id, series_id) VALUES (', paste(act_id, obj_id, series_id, sep = ','), ')', sep = '')) actobj_id <- sqlQuery(db, paste('SELECT id FROM actobj WHERE act_id = ', act_id, sep = ''))[1,1] #Write indexes ColNames <- colnames(dataframe[(colnames(dataframe) == "id") == FALSE]) ColNames <- ColNames[(ColNames == "obs") == FALSE] ColNames <- ColNames[(ColNames == "Result") == FALSE] ColNames <- ColNames[(ColNames == "result") == FALSE] ColNames <- ColNames[(ColNames == "Freq") == FALSE] nind <- length(ColNames) DimNames <- rep(vector("list", 1), nind) names(DimNames) <- ColNames indlengths <- 0 for (i in 1:nind) { DimNames[[i]] <- levels(factor(dataframe[, ColNames[i]])) indlengths[i] <- length(DimNames[[i]]) } indQuery <- NA indQuery[1:nind*5 - 4] <- gsub(" ", "_", ColNames) indQuery[1:nind*5 - 3] <- '","' indQuery[1:nind*5 - 2] <- ColNames indQuery[1:nind*5 - 1] <- '",6' indQuery[1:(nind - 1)*5] <- '),("' sqlQuery(db, paste('INSERT INTO obj (ident, name, objtype_id) VALUES ("', paste(indQuery, collapse = ''), ')', ' ON DUPLICATE KEY UPDATE newest = 0', sep = '')) IndIds <- sqlQuery(db, paste('SELECT id, ident FROM obj WHERE ident IN("', paste(gsub(" ", "_", ColNames), collapse = '","'), '")', sep = '')) DimIds <- DimNames DimN <- 1:nind names(DimN) <- gsub(" ", "_", ColNames) for (i in 1:nrow(IndIds)) { names(DimIds)[DimN[IndIds[i,2]]] <- IndIds[grep(gsub(" ", "_", ColNames[i]), IndIds$ident), 1] } #Write locations y <- 1 LocNames <- as.data.frame(matrix(rep(NA, 2*sum(indlengths)), sum(indlengths), 2)) for (i in 1:nind) { LocNames[y:(y + indlengths[i] - 1),1:2] <- matrix(c(rep(ColNames[i], indlengths[i]), levels(factor(dataframe[,ColNames[i]]))), indlengths[i], 2) y <- y + indlengths[i] } LocNames[,1] <- names(DimIds)[DimN[LocNames[,1]]] nloc <- nrow(LocNames) locQuery <- NA locQuery[1:nloc*4 - 3] <- LocNames[,1] locQuery[1:nloc*4 - 2] <- ',"' locQuery[1:nloc*4 - 1] <- LocNames[,2] locQuery[1:(nloc - 1)*4] <- '"),(' sqlQuery(db, paste('INSERT INTO loc (obj_id_i, location) VALUES (', paste(locQuery, collapse = ''), '")', ' ON DUPLICATE KEY UPDATE roww = 0', sep = '')) LocIds <- sqlQuery(db, paste('SELECT id, obj_id_i, location FROM loc WHERE location IN("', paste(LocNames[,2], collapse = '","'), '") AND obj_id_i IN(', paste(names(DimIds), collapse = ','), ')', sep = '')) LocMap <- NA y <- 1 for (i in 1:nind) { LocMap <- LocIds[grep(names(DimIds)[i], LocIds$obj_id_i), 1] names(LocMap) <- LocIds[grep(names(DimIds)[i], LocIds$obj_id_i), 3] DimIds[[i]] <- LocMap[DimNames[[i]]] } #Changing location names in table into ids for (i in 1:nind) { dataframe[,ColNames[i]] <- factor(dataframe[,ColNames[i]]) levels(dataframe[,ColNames[i]]) <- DimIds[[i]] } #A hidden parameter for adjusting query packet sizes, the higher the faster, though crash becomes likelier maxrows <- 50000 #Writing cell obscol <- colnames(dataframe) == "obs" if (length(obscol[obscol==TRUE]) >= 1) { obscol <- TRUE n <- length(levels(factor(dataframe[,"obs"])))} else { obscol <- FALSE n <- 1 } ncell <- nrow(dataframe)/n cellQuery <- NA cellQuery[1:ncell*6-5] <- actobj_id cellQuery[1:ncell*6-4] <- "," if (n == 1) { cellQuery[1:ncell*6-3] <- dataframe[,rescol]} else { cellQuery[1:ncell*6-3] <- apply(matrix(dataframe[,rescol], n, ncell), 2, mean) } cellQuery[1:ncell*6-2] <- "," cellQuery[1:ncell*6-1] <- n cellQuery[1:(ncell-1)*6] <- "),(" if (ncell >= maxrows) { for (i in 1:(ncell%/%maxrows)) { sqlQuery(db, paste('INSERT INTO cell (actobj_id, mean, n) VALUES (', paste(cellQuery[ ((i-1)*6*maxrows+1):(i*6*maxrows-1)], collapse = ''), ')', sep = '')) } } if (ncell%%maxrows != 0) sqlQuery(db, paste('INSERT INTO cell (actobj_id, mean, n) VALUES (', paste(cellQuery[(ncell%/% maxrows*6*maxrows+1):length(cellQuery)], collapse = ''), ')', sep = '')) #Writing res cell_id <- sqlQuery(db, paste("SELECT id FROM cell WHERE actobj_id = ", actobj_id, " ORDER BY ID", sep = ""))[,1] if (ncell != length(cell_id)) stop("Number of cells written does not match with expected value") resQuery <- NA resQuery[1:nrow(dataframe)*6-5] <- rep(cell_id, each = n) resQuery[1:nrow(dataframe)*6-4] <- "," if (n==1) resQuery[1:nrow(dataframe)*6-3] <- "0" else resQuery[1:nrow(dataframe)*6-3] <- dataframe[,"obs"] resQuery[1:nrow(dataframe)*6-2] <- "," resQuery[1:nrow(dataframe)*6-1] <- dataframe[,rescol] resQuery[1:(nrow(dataframe)-1)*6] <- "),(" if (nrow(dataframe) >= maxrows) { for (i in 1:(nrow(dataframe)%/%maxrows)) { sqlQuery(db, paste('INSERT INTO res (cell_id, obs, result) VALUES (', paste(resQuery[((i-1) *4*maxrows+1):(i*4*maxrows-1)], collapse = ''), ')', sep = '')) } } if (nrow(dataframe)%%maxrows != 0) sqlQuery(db, paste('INSERT INTO res (cell_id, obs, result) VALUES (', paste(resQuery[ (nrow(dataframe)%/%maxrows*4*maxrows+1):length(resQuery)], collapse = ''), ')', sep = '')) #Writing loccell locidmatrix <- rep(NA, nind*ncell) dim(locidmatrix) <- c(nind,ncell) for (i in 1:nind) { locidmatrix[i,] <- as.character(dataframe[1:ncell*n - n + 1,ColNames[i]]) } loccellQuery <- rep(NA, 4*nind*ncell) dim(loccellQuery) <- c(4*nind, ncell) loccellQuery[1:nind*4-3, 1:ncell] <- rep(cell_id, each = nind) loccellQuery[1:nind*4-2, 1:ncell] <- "," loccellQuery[1:nind*4-1, 1:ncell] <- locidmatrix loccellQuery[1:nind*4, 1:ncell] <- "),(" loccellQuery[nind*4, ncell] <- "" if (ncell >= maxrows) { for (i in 1:(ncell%/%maxrows)) { loccellQuery[nind*4, i*maxrows] <- "" sqlQuery(db, paste('INSERT INTO loccell (cell_id, loc_id) VALUES (', paste(loccellQuery[, ((i-1)*maxrows+1):(i*maxrows)], collapse = ''), ')', sep = '')) } } if (ncell%%maxrows != 0) sqlQuery(db, paste('INSERT INTO loccell (cell_id, loc_id) VALUES (', paste(loccellQuery[, (ncell%/%maxrows*maxrows+1):ncell], collapse = ''), ')', sep = '')) }
Usage
op_baseWrite(db, input, ident = NULL, name = NULL, unit = NULL, objtype_id = NULL, who = NULL, acttype = NULL)
- Db and input must be defined, the rest if not defined is prompted for by the function as needed.
Restrictions
- Input may be given in either array or data.frame form.
- Input in data.frame form must contain a "Freq", "Result" or "result" column, this is where the numerical value is read.
- Probabilistic input must contain a dimension (in case of array) or a column (in case of data.frame) named "obs". Also: all cells must have values for each "obs" value. And the sorting order must be: indices first, obs last; for arrays this means that the first index must be "obs".